import datetime
import glob
import gzip
import json
import os
import shutil


class MetadataExctractor:
    """Extract header metadata from gzip-compressed PDB-style text files.

    ``get_metadata`` parses one file into a JSON string;
    ``extract_metadata`` runs it over a directory and writes the results as
    gzip-compressed, newline-delimited JSON dump files.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _split_list(text):
        """Split a comma-separated field into stripped, capitalized items.

        Empty items (e.g. produced by a trailing comma on a line that is
        continued on the next record) are dropped.
        """
        return [
            item.strip().capitalize()
            for item in text.split(",")
            if item.strip()
        ]

    def get_metadata(self, path) -> str:
        """Parse the header records of one gzip-compressed file.

        Lines are read until the first ``REMARK`` record; only ``HEADER``,
        ``TITLE``, ``KEYWDS``, ``AUTHOR`` and ``JRNL`` records are used.

        Args:
            path: Path to the gzip-compressed, UTF-8 encoded input file.

        Returns:
            A JSON object (as a string) with whichever of these keys were
            found: ``classification``, ``pdb``, ``deposition_date``
            (``YYYY-MM-DD``), ``title``, ``Keywords``, ``authors``,
            ``pmid`` and ``doi``.

        Raises:
            ValueError: If the date slice of a ``HEADER`` line does not
                match the ``%d-%b-%y`` format.
        """
        p = {}
        with gzip.open(path) as f:
            for raw in f:
                # Decode line by line so bytes after the first REMARK record
                # are never decoded.
                l = raw.decode("utf-8").strip()
                if l.startswith("REMARK"):
                    # Everything of interest precedes the first REMARK.
                    break
                if l.startswith("HEADER"):
                    # Fixed-column slices: classification, date, identifier.
                    p['classification'] = l[10:49].strip().capitalize()
                    p['pdb'] = l[62:].strip()
                    p['deposition_date'] = datetime.datetime.strptime(
                        l[50:59], "%d-%b-%y"
                    ).strftime("%Y-%m-%d")
                elif l.startswith("TITLE"):
                    # Each continuation line is capitalized on its own and
                    # appended with a single space.
                    text = l[10:].strip().capitalize()
                    if "title" in p:
                        p['title'] = f"{p['title']} {text}"
                    else:
                        p['title'] = text
                elif l.startswith("KEYWDS"):
                    # Slice from index 10 (as TITLE/AUTHOR do) so the first
                    # character of the first keyword is not lost; the helper
                    # strips every item, including those on the first line.
                    p.setdefault('Keywords', []).extend(
                        self._split_list(l[10:])
                    )
                elif l.startswith("AUTHOR"):
                    p.setdefault('authors', []).extend(
                        self._split_list(l[10:])
                    )
                elif l.startswith("JRNL"):
                    # NOTE(review): plain substring matching, so a JRNL line
                    # that merely contains "PMID"/"DOI" in other text would
                    # also match -- confirm whether that can occur in the data.
                    if 'PMID' in l:
                        p['pmid'] = l[l.find("PMID") + 5:].strip()
                    if 'DOI' in l:
                        p['doi'] = l[l.find("DOI") + 5:].strip()
        return json.dumps(p)

    def extract_metadata(self, input_path="download",
                         output_path="pdb_metadata", chunk_size=10000):
        """Write the metadata of every ``*.gz`` file in a directory to dumps.

        ``output_path`` is deleted if it already exists as a directory and
        is then recreated. One JSON line is written per input file into
        ``dump_0<part>.gz`` files inside ``output_path``; a new part is
        started every time the running file count reaches a multiple of
        ``chunk_size`` (before that file is written).

        Args:
            input_path: Directory scanned (non-recursively) for ``*.gz``.
            output_path: Directory that receives the dump files.
            chunk_size: Number of inputs after which a new part is started.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be >= 1")

        if os.path.exists(output_path) and os.path.isdir(output_path):
            shutil.rmtree(output_path)
        os.makedirs(output_path)

        part = 0
        w = gzip.open(os.path.join(output_path, f"dump_0{part}.gz"), "w")
        try:
            # Sorted so the content of each part is reproducible.
            items = sorted(glob.glob(f"{input_path}/*.gz"))
            for i, item in enumerate(items, start=1):
                if i % chunk_size == 0:
                    part += 1
                    w.close()
                    print(f"PARSED {i}")
                    # Later parts must also be created inside output_path.
                    w = gzip.open(
                        os.path.join(output_path, f"dump_0{part}.gz"), "w"
                    )
                w.write(self.get_metadata(item).encode("utf-8"))
                w.write("\n".encode("utf-8"))
        finally:
            # Always close the last part so the gzip trailer is written.
            w.close()