import json
import gzip
import glob


def validate(input_path="metadata"):
    """Count validation problems in gzipped JSON-lines metadata dumps.

    Every file matching ``{input_path}/dump*.gz`` is read line by line; each
    line is decoded as UTF-8 and parsed as one JSON record. The following
    checks are applied to each record:

    * ``MissingDate``     - ``deposition_date`` absent or ``None``
    * ``MissingLinks``    - neither ``pmid`` nor ``doi`` key present
    * ``MissingAuthor``   - ``authors`` absent, ``None`` or empty
    * ``MissingTitle``    - ``title`` absent or ``None``
    * ``MissingKeywords`` - ``Keywords`` absent or ``None`` (counted only;
      does not mark the record as erroneous)
    * ``WRONGPDB``        - ``pdb`` absent, ``None`` or not of length 4

    Args:
        input_path: Directory that contains the ``dump*.gz`` files.

    Returns:
        A tuple ``(error, valid, total, error_record)`` where ``error`` maps
        each problem name above to the number of records exhibiting it
        (names with no occurrences are omitted), ``valid`` is the number of
        records without any invalidating problem, ``total`` is the number of
        records read and ``error_record`` is the number of records with at
        least one invalidating problem.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    error = {}
    valid = 0
    total = 0
    error_record = 0

    def count(name):
        # Tally one more occurrence of the named problem.
        error[name] = error.get(name, 0) + 1

    for path in glob.glob(f"{input_path}/dump*.gz"):
        with gzip.open(path) as f:
            for line in f:
                data = json.loads(line.decode("utf-8"))
                invalid = False

                if "deposition_date" not in data or data["deposition_date"] is None:
                    count("MissingDate")
                    invalid = True

                if "pmid" not in data and "doi" not in data:
                    count("MissingLinks")
                    invalid = True

                # A JSON null must be counted as missing rather than crash
                # on len(None), consistent with the date/title checks.
                if (
                    "authors" not in data
                    or data["authors"] is None
                    or len(data["authors"]) < 1
                ):
                    count("MissingAuthor")
                    invalid = True

                if "title" not in data or data["title"] is None:
                    count("MissingTitle")
                    invalid = True

                # NOTE(review): missing keywords are tallied but do not make
                # the record invalid, and the key is capitalised unlike the
                # others -- presumably intentional; confirm against the data.
                if "Keywords" not in data or data["Keywords"] is None:
                    count("MissingKeywords")

                # Same null guard as for authors.
                if (
                    "pdb" not in data
                    or data["pdb"] is None
                    or len(data["pdb"]) != 4
                ):
                    count("WRONGPDB")
                    invalid = True

                total += 1
                if invalid:
                    error_record += 1
                else:
                    valid += 1

    return (error, valid, total, error_record)