diff --git a/main.py b/main.py index f725008..3cb8629 100644 --- a/main.py +++ b/main.py @@ -3,12 +3,18 @@ from pdb.pdb_metadata_extractor import MetadataExctractor from pdb.pdb_validator import validate from uniprot.download import UniprotSwissDownloader from uniprot.metadata import MetadataExctractor as ME +from uniprot.uniprot_validator import validate as validate_uniprot if __name__ == '__main__': u = UniprotSwissDownloader() #u.download() k = ME() - k.extract_metadata() + error, valid, total, error_record = validate_uniprot() + print(error) + print(f"Valid {valid}/{total}") + print(f"Error {error_record}/{total}") + # k.extract_metadata() + # p = PDBDownloader() # p.get_file_to_downloads(max_item=4) diff --git a/uniprot/metadata.py b/uniprot/metadata.py index 3532262..67d0332 100644 --- a/uniprot/metadata.py +++ b/uniprot/metadata.py @@ -15,6 +15,7 @@ class MetadataExctractor: def get_metadata(self) : p = {} + subjCount =0 for line in self.current_file: l = line.decode("utf-8").strip() if l.startswith("AC"): @@ -34,11 +35,12 @@ class MetadataExctractor: elif l.startswith("OS "): p['organism_species'] =l[4:-1].strip() - elif l.startswith("OC "): + elif l.startswith("OC ") and subjCount < 20: s = l[4:-1].strip().split(';') subjects = p.get('subjects', []) for sub in s: subjects.append(sub.strip()) + subjCount += 1 p['subjects']= subjects elif l.startswith("RX "): references = [c.strip() for c in l[4:-1].strip().split(';')] @@ -51,7 +53,11 @@ class MetadataExctractor: if len(relations): p["references"] = relations elif l.startswith("//"): - yield json.dumps(p) + s =json.dumps(p) + subjCount = 0 + del p + p = {} + yield s def extract_metadata(self, output_path="uniprot_metadata"): @@ -65,11 +71,9 @@ class MetadataExctractor: i +=1 w.write(item.encode()) w.write("\n".encode()) - if i % 10000==0: + if i % 100000==0: part +=1 w.flush() w.close() print(f"PARSED {i}") - w = gzip.open(f"dump_0{part}.gz", "w") - - \ No newline at end of file + w = 
gzip.open(f"{output_path}/dump_0{part}.gz", "w")
\ No newline at end of file
diff --git a/uniprot/uniprot_validator.py b/uniprot/uniprot_validator.py
new file mode 100644
index 0000000..f540611
--- /dev/null
+++ b/uniprot/uniprot_validator.py
@@ -0,0 +1,48 @@
+import json
+import gzip
+import glob
+
+
+
+def validate(input_path="uniprot_metadata") :
+    """Count valid and invalid records in the gzipped JSON-lines dumps.
+
+    Every file matching ``{input_path}/dump*.gz`` is read line by line and
+    each line is parsed as one JSON object.
+
+    Returns a tuple ``(error, valid, total, error_record)``: ``error`` maps
+    an error name to the number of records showing it, ``valid`` is the
+    number of records with no blocking error, ``total`` is the number of
+    records read and ``error_record`` is the number of records with at
+    least one blocking error.
+    """
+    error = {}
+    valid = 0
+    total = 0
+    error_record = 0
+    for i in glob.glob(f"{input_path}/dump*.gz"):
+        with gzip.open(i) as f:
+            for line in f:
+                data = json.loads(line.decode("utf-8"))
+                e = False
+                if "dates" not in data or data['dates'] is None or len(data['dates']) ==0:
+                    error['MissingDate'] = error.get('MissingDate', 0)+1
+                    e= True
+                # "or", not "and": a record without 'pid' must be counted
+                # here instead of raising KeyError on data['pid'].
+                if 'pid' not in data or data['pid'] is None or len(data['pid'])==0:
+                    error['MissingLinks'] = error.get('MissingLinks', 0)+1
+                    e= True
+                if "title" not in data or data['title'] is None:
+                    error['MissingTitle'] = error.get('MissingTitle', 0)+1
+                    e= True
+                # Missing subjects are tallied but do not mark the record
+                # as an error record.
+                if 'subjects' not in data or data['subjects'] is None:
+                    error['MissingKeywords'] = error.get('MissingKeywords', 0)+1
+                total += 1
+                if e:
+                    error_record +=1
+                else:
+                    valid +=1
+    return (error, valid, total, error_record)