40 lines
1.5 KiB
Python
40 lines
1.5 KiB
Python
|
import json
|
||
|
import gzip
|
||
|
import glob
|
||
|
|
||
|
|
||
|
|
||
|
def validate(input_path="metadata"):
    """Count metadata problems across gzip-compressed JSON-lines dumps.

    Every file matching ``{input_path}/dump*.gz`` is read line by line; each
    non-blank line is parsed as one JSON record and checked for the fields
    listed below.

    Args:
        input_path: Directory that contains the ``dump*.gz`` files.

    Returns:
        A tuple ``(error, valid, total, error_record)`` where

        * ``error`` maps a problem name (``MissingDate``, ``MissingLinks``,
          ``MissingAuthor``, ``MissingTitle``, ``MissingKeywords``,
          ``WRONGPDB``) to the number of records showing that problem;
          names that never occurred are absent from the dict,
        * ``valid`` is the number of records without an invalidating problem,
        * ``total`` is the number of records parsed,
        * ``error_record`` is the number of records with at least one
          invalidating problem.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    error = {}
    valid = 0
    total = 0
    error_record = 0

    def count(name):
        # Problem names are created lazily so unused ones never appear.
        error[name] = error.get(name, 0) + 1

    for path in glob.glob(f"{input_path}/dump*.gz"):
        with gzip.open(path) as f:
            for line in f:
                # A blank (often trailing) line is not a record; skipping it
                # avoids a JSONDecodeError that would abort the whole scan.
                if not line.strip():
                    continue
                data = json.loads(line.decode("utf-8"))
                invalid = False

                if data.get("deposition_date") is None:
                    count("MissingDate")
                    invalid = True

                # NOTE(review): only key presence is tested here, so a record
                # with "pmid": null still counts as linked - confirm intent.
                if "pmid" not in data and "doi" not in data:
                    count("MissingLinks")
                    invalid = True

                # Truthiness covers an absent key, an explicit null and an
                # empty list alike; len(None) used to raise TypeError here.
                if not data.get("authors"):
                    count("MissingAuthor")
                    invalid = True

                if data.get("title") is None:
                    count("MissingTitle")
                    invalid = True

                # NOTE(review): missing keywords are counted but do not mark
                # the record invalid, unlike every other check; the key is
                # also capitalised unlike the others - confirm both.
                if data.get("Keywords") is None:
                    count("MissingKeywords")

                # A null or otherwise unsized "pdb" value is reported as a
                # wrong id instead of crashing on len().
                try:
                    wrong_pdb = len(data.get("pdb")) != 4
                except TypeError:
                    wrong_pdb = True
                if wrong_pdb:
                    count("WRONGPDB")
                    invalid = True

                total += 1
                if invalid:
                    error_record += 1
                else:
                    valid += 1

    return (error, valid, total, error_record)
|