1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import base64
import codecs
import re
import zipfile
import chardet
import toutiao_dmp_pb2
# Regular expressions an ID string must match, keyed by the integer data-type
# code that is passed to ``validate_id_format``.
# NOTE(review): what each code stands for is defined outside this file; the
# trailing comments describe the shape of each pattern only.
# Backslashes are doubled so the literals contain no invalid escape sequence
# (``\d`` in a non-raw literal is deprecated on Python 3) while keeping the
# exact same pattern text and the ``u''`` prefix.
PATTERNS = {
    0: u'^[a-zA-Z0-9]{15}$',  # exactly 15 alphanumeric characters
    # five hyphen-separated alphanumeric groups of 8-4-4-4-12 characters
    1: u'^[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$',
    2: u'^\\d+$',  # one or more digits
    3: u'^1[34578]{1}\\d{9}$',  # 11 digits: "1", one of 3/4/5/7/8, then 9 more
    4: u'^[a-zA-Z0-9]{32}$',  # exactly 32 alphanumeric characters
    5: u'^[a-zA-Z0-9]{32}$',  # same shape as code 4
    6: u'^[a-fA-F0-9]{64}$',  # exactly 64 hexadecimal digits
}
def validate_id_format(data_type, id_data):
    """Tell whether ``id_data`` has the format registered for ``data_type``.

    Args:
        data_type: Key looked up in the module-level ``PATTERNS`` table.
        id_data: String to test against the pattern found for ``data_type``.

    Returns:
        True when ``PATTERNS`` holds a non-empty pattern for ``data_type`` and
        that pattern matches ``id_data``; False otherwise, including when
        ``data_type`` is not a key of ``PATTERNS``.
    """
    pattern = PATTERNS.get(data_type)
    # Unknown data types are treated as a failed check, not as an error.
    if not pattern:
        return False
    matched = re.match(pattern, id_data)
    return matched is not None
def validate(file):
    """Count well-formed and malformed IDs in a zip archive of DMP records.

    Every member of the archive is read line by line. Each line is stripped,
    base64-decoded and parsed as a ``toutiao_dmp_pb2.DmpData`` message, and
    every entry of the message's ``idList`` is checked with
    ``validate_id_format(entry.dataType, entry.id)``.

    Args:
        file: Path or file-like object accepted by ``zipfile.ZipFile``.

    Returns:
        A ``(valid_num, invalid_num, total_num)`` tuple: the number of IDs
        that passed the format check, the number that failed it, and the sum
        of ``len(tags)`` over the IDs that passed.

    Raises:
        Nothing is caught here: errors raised by ``zipfile``, by
        ``base64.b64decode`` or by ``ParseFromString`` propagate to the
        caller.
    """
    valid_num = 0
    invalid_num = 0
    total_num = 0
    # The context manager closes the archive even when a record fails to
    # decode part-way through.
    with zipfile.ZipFile(file) as zip_file:
        for inside_file in zip_file.namelist():
            # Plain 'r' instead of 'rU': ZipFile.open() rejects 'U' modes on
            # Python 3.6+, and strip() below already removes a trailing '\r'.
            with zip_file.open(inside_file, 'r') as f:
                # chardet reports encoding=None for empty or undetectable
                # input (e.g. an empty member), which codecs.iterdecode()
                # cannot accept; fall back to UTF-8 in that case.
                encoding = chardet.detect(f.peek()).get('encoding') or 'utf-8'
                decoded_file = codecs.iterdecode(f, encoding, errors='ignore')
                for data_line in decoded_file:
                    dmp_data = toutiao_dmp_pb2.DmpData()
                    dmp_data.ParseFromString(
                        base64.b64decode(data_line.strip()))
                    for id_item in dmp_data.idList:
                        if validate_id_format(id_item.dataType, id_item.id):
                            valid_num += 1
                            # NOTE(review): tags are summed for IDs that pass
                            # the check only; confirm this is the intended
                            # scope rather than every ID in the list.
                            total_num += len(id_item.tags)
                        else:
                            invalid_num += 1
    return valid_num, invalid_num, total_num
"""
# zip_file = gzip.ZipFile('/Users/wangjf/Workspace/data/output/part-00000.gz')
valid_num = 0
invalid_num = 0
path = "/Users/wangjf/Workspace/data/output_1/part-00000.gz"
if os.path.exists(path):
with gzip.open(path, 'r') as f:
decoded_file = codecs.iterdecode(f, 'utf8', errors='ignore')
for data_line in decoded_file:
data_line = data_line.strip()
data_line = base64.b64decode(data_line)
dmp_data = toutiao_dmp_pb2.DmpData()
dmp_data.ParseFromString(data_line)
print dmp_data.idList
for id_item in dmp_data.idList:
if not validate_id_format(id_item.dataType, id_item.id):
print 'invaild item:',
print id_item
invalid_num += 1
else:
print 'vaild item:',
print id_item
valid_num += 1
print 'valid_num: %s' % valid_num
print 'invalid_num: %s' % invalid_num
"""
# Disabled script entry point, kept as a bare string literal so it is never
# executed. NOTE(review): no ``main`` function is visible in this part of the
# file, so re-enabling it as-is would presumably fail - confirm before use.
'''
if __name__ == '__main__':
main()
'''