# bioentities-preprocess/uniprot/uniprot_validator.py  (Python; listing metadata: 34 lines, 1.2 KiB)

import json
import gzip
import glob
def validate(input_path="uniprot_metadata"):
    """Count missing-field problems in gzipped JSON-lines dump files.

    Every file matching ``{input_path}/dump*.gz`` is read line by line; each
    line is decoded as UTF-8 and parsed as one JSON object. For each record
    the following checks are applied and tallied by name:

    * ``MissingDate``     -- ``dates`` is absent, ``None`` or empty.
    * ``MissingLinks``    -- ``pid`` is absent, ``None`` or empty.
    * ``MissingTitle``    -- ``title`` is absent or ``None``.
    * ``MissingKeywords`` -- ``subjects`` is absent or ``None``. This one is
      only counted; on its own it does not make the record invalid.

    Args:
        input_path: Directory that contains the ``dump*.gz`` files.

    Returns:
        A tuple ``(error, valid, total, error_record)`` where ``error`` maps
        each problem name to the number of records showing it, ``valid`` is
        the number of records without a date/links/title problem, ``total``
        is the number of records read, and ``error_record`` is the number of
        records with at least one date/links/title problem.
    """
    error = {}
    valid = 0
    total = 0
    error_record = 0

    def _count(problem):
        # Tally one more occurrence of the named problem.
        error[problem] = error.get(problem, 0) + 1

    for dump_path in glob.glob(f"{input_path}/dump*.gz"):
        with gzip.open(dump_path) as f:
            for line in f:
                data = json.loads(line.decode("utf-8"))
                invalid = False

                # Truthiness covers the absent, None and empty cases at once.
                if not data.get("dates"):
                    _count("MissingDate")
                    invalid = True

                # The original condition used `and`, which raised KeyError
                # when 'pid' was absent and never fired when it was present.
                if not data.get("pid"):
                    _count("MissingLinks")
                    invalid = True

                # An empty title is still accepted; only absent/None counts.
                if data.get("title") is None:
                    _count("MissingTitle")
                    invalid = True

                # Missing keywords are reported but do not invalidate.
                if data.get("subjects") is None:
                    _count("MissingKeywords")

                total += 1
                if invalid:
                    error_record += 1
                else:
                    valid += 1

    return (error, valid, total, error_record)