import datetime
import gzip
import json
import os
import re
import shutil

# UniProt accession number pattern (defined for reference; not used below).
regex = r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"


class MetadataExctractor:
    """Parses a gzipped UniProtKB/Swiss-Prot flat file and extracts per-entry metadata."""

    def __init__(self, path="uniprot_sprot.dat.gz") -> None:
        print(f"Open Path {path}")
        self.current_file = gzip.open(path)

    def get_metadata(self):
        """Yield one JSON string per Swiss-Prot entry."""
        p = {}
        subj_count = 0
        for line in self.current_file:
            l = line.decode("utf-8").strip()
            if l.startswith("AC"):
                # Accession number(s); drop the trailing ';'.
                p['pid'] = l[4:-1].strip()
            elif l.startswith("DT "):
                # Date lines, e.g. "DT   01-OCT-1996, integrated into UniProtKB/Swiss-Prot."
                cd = {}
                k = l[4:-1].split(',')
                cd['date'] = datetime.datetime.strptime(
                    k[0].strip(), "%d-%b-%Y"
                ).strftime("%Y-%m-%d")
                cd['date_info'] = k[1].strip()
                dates = p.get("dates", [])
                dates.append(cd)
                p['dates'] = dates
            elif l.startswith("DE "):
                # Description; keep only the recommended full name.
                k = l[4:-1]
                if 'RecName: Full=' in k:
                    p['title'] = k[k.find('=') + 1:]
            elif l.startswith("OS "):
                # Organism species.
                p['organism_species'] = l[4:-1].strip()
            elif l.startswith("OC ") and subj_count < 20:
                # Organism classification, ';'-separated; cap at 20 OC lines per entry.
                s = l[4:-1].strip().split(';')
                subjects = p.get('subjects', [])
                for sub in s:
                    subjects.append(sub.strip())
                subj_count += 1
                p['subjects'] = subjects
            elif l.startswith("RX "):
                # Literature cross-references, e.g. "RX   PubMed=...; DOI=...;"
                references = [c.strip() for c in l[4:-1].strip().split(';')]
                relations = []
                for r in references:
                    dd = r.split("=")
                    if len(dd) == 2 and dd[0].lower().strip() in ['pubmed', 'doi']:
                        relations.append({dd[0]: dd[1]})
                if relations:
                    p["references"] = relations
            elif l.startswith("//"):
                # End of entry: emit the accumulated record and start a new one.
                s = json.dumps(p)
                subj_count = 0
                p = {}
                yield s

    def extract_metadata(self, output_path="uniprot_metadata"):
        """Write extracted metadata as gzipped JSON-lines dumps of 100,000 entries each."""
        if os.path.exists(output_path) and os.path.isdir(output_path):
            shutil.rmtree(output_path)
        os.mkdir(output_path)
        part = 0
        i = 0
        w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        for item in self.get_metadata():
            i += 1
            w.write(item.encode())
            w.write("\n".encode())
            if i % 100000 == 0:
                # Roll over to a new dump file every 100,000 entries.
                part += 1
                w.flush()
                w.close()
                print(f"PARSED {i}")
                w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        # Close the last (possibly partial) dump file.
        w.flush()
        w.close()
        print(f"PARSED {i}")
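
# Minimal usage sketch (assumption: a gzipped Swiss-Prot flat file named
# "uniprot_sprot.dat.gz" sits in the working directory; both the input path
# and the output directory below are illustrative defaults, not required names).
if __name__ == "__main__":
    extractor = MetadataExctractor(path="uniprot_sprot.dat.gz")
    extractor.extract_metadata(output_path="uniprot_metadata")
    # Each dump_0*.gz file under uniprot_metadata/ then holds one JSON record per
    # line, e.g. {"pid": "...", "dates": [...], "title": "...", "subjects": [...]}.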