#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import base64
import codecs
import re
import zipfile

import chardet

import toutiao_dmp_pb2

# Regular expression an ID must match, keyed by the record's dataType value.
# Digit classes are spelled out as [0-9] rather than \d: inside these non-raw
# literals \d is an invalid string escape, and in a unicode pattern on
# Python 3 it would also accept non-ASCII decimal digits.
PATTERNS = {
	# Exactly 15 ASCII letters/digits.
	0: u'^[a-zA-Z0-9]{15}$',
	# Hyphen-separated 8-4-4-4-12 groups of ASCII letters/digits (UUID-shaped).
	1: u'^[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$',
	# One or more ASCII digits.
	2: u'^[0-9]+$',
	# 11 ASCII digits: a leading 1, then one of 3/4/5/7/8, then nine digits.
	# NOTE(review): presumably a mobile phone number; confirm that the allowed
	# second digits are still complete.
	3: u'^1[34578][0-9]{9}$',
	# Exactly 32 ASCII letters/digits.
	4: u'^[a-zA-Z0-9]{32}$',
	# Exactly 32 ASCII letters/digits (same shape as type 4).
	5: u'^[a-zA-Z0-9]{32}$',
	# Exactly 64 hexadecimal characters.
	6: u'^[a-fA-F0-9]{64}$',
}


def validate_id_format(data_type, id_data):
	"""Check an ID string against the pattern registered for its data type.

	Args:
		data_type: Key used to look up the regular expression in ``PATTERNS``.
		id_data: The ID string to check.

	Returns:
		True if ``PATTERNS`` has a pattern for ``data_type`` and that pattern
		matches the whole of ``id_data``; False otherwise, including when
		``data_type`` has no registered pattern.
	"""
	reg_pattern = PATTERNS.get(data_type)
	if not reg_pattern:
		return False
	matched = re.match(reg_pattern, id_data)
	# '$' also matches just before a trailing newline, so additionally require
	# the match to consume the entire string; otherwise "123\n" would pass.
	return matched is not None and matched.end() == len(id_data)


def _iter_record_lines(raw_file):
	"""Yield the stripped, non-empty text lines of an open zip member."""
	encoding = chardet.detect(raw_file.peek()).get('encoding')
	try:
		codecs.lookup(encoding)
	except (LookupError, TypeError):
		# No encoding was detected (e.g. an empty member or a directory entry)
		# or it names a codec this interpreter lacks; fall back to UTF-8.
		encoding = 'utf-8'
	for chunk in codecs.iterdecode(raw_file, encoding, errors='ignore'):
		# splitlines() keeps '\r'-only line endings working now that the
		# member is no longer opened in universal-newline mode.
		for line in chunk.splitlines():
			line = line.strip()
			if line:
				yield line


def validate(file):
	"""Count well-formed and malformed IDs in a zip archive of DmpData records.

	Every member of the archive is read as text with one record per line.
	Each non-empty line is base64-decoded and parsed as a
	``toutiao_dmp_pb2.DmpData`` message, and every entry of its ``idList`` is
	checked with ``validate_id_format``.

	Args:
		file: Path or file-like object accepted by ``zipfile.ZipFile``.

	Returns:
		A ``(valid_num, invalid_num, total_num)`` tuple: the number of IDs
		that passed the format check, the number that failed it, and the
		summed tag count of the IDs that passed (tags on invalid IDs are
		not counted).

	Raises:
		Errors raised while opening the archive, base64-decoding a line or
		parsing a message are not caught here and propagate to the caller.
	"""
	valid_num = 0
	invalid_num = 0
	total_num = 0
	# The context manager closes the archive even when a record fails to decode.
	with zipfile.ZipFile(file) as zip_file:
		for inside_file in zip_file.namelist():
			# Plain 'r': ZipFile.open rejects the 'U' flag on Python 3.6+.
			with zip_file.open(inside_file, 'r') as f:
				for data_line in _iter_record_lines(f):
					dmp_data = toutiao_dmp_pb2.DmpData()
					dmp_data.ParseFromString(base64.b64decode(data_line))
					for id_item in dmp_data.idList:
						if validate_id_format(id_item.dataType, id_item.id):
							valid_num += 1
							total_num += len(id_item.tags)
						else:
							invalid_num += 1
	return valid_num, invalid_num, total_num


# Disabled script entry point: the guard below lives inside a bare string
# literal, so it is never executed.
# NOTE(review): no `main` function is defined in the visible part of this
# file; define one before re-enabling this guard.
'''
if __name__ == '__main__':
    main()
'''