# bioentities-preprocess/pdb/pdb_metadata_extractor.py
#
# 69 lines
# 2.6 KiB
# Python

import gzip
import json
import glob
import datetime
import os
import shutil
class MetadataExctractor:
    """Extract title-section metadata from gzip-compressed PDB flat files.

    ``get_metadata`` parses one file into a JSON string and
    ``extract_metadata`` runs it over a directory, writing the JSON records
    (one per line) into gzip-compressed dump files.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _split_list(field):
        """Split a comma-separated field into capitalized, non-empty items."""
        # A trailing comma before a continuation line would otherwise
        # produce an empty entry.
        return [
            item.strip().capitalize()
            for item in field.split(",")
            if item.strip()
        ]

    @staticmethod
    def _part_path(output_path, part):
        """Return the path of dump file number *part* inside *output_path*."""
        return os.path.join(output_path, f"dump_0{part}.gz")

    def get_metadata(self, path):
        """Parse the leading records of a gzip-compressed PDB file.

        Reading stops at the first ``REMARK`` record. Column slices follow
        the fixed-width layout of the wwPDB format (text fields start at
        column 11, i.e. index 10).

        Args:
            path: Path to a ``.gz`` file containing PDB records.

        Returns:
            A JSON string whose object may contain the keys
            ``classification``, ``pdb``, ``deposition_date`` (``YYYY-MM-DD``),
            ``title``, ``Keywords``, ``authors``, ``pmid`` and ``doi``,
            depending on which records are present.

        Raises:
            ValueError: If the ``HEADER`` deposition date does not match
                ``%d-%b-%y``.
        """
        metadata = {}
        title_parts = []
        with gzip.open(path) as handle:
            for raw_line in handle:
                # Decode line by line so bytes after the first REMARK are
                # never decoded.
                line = raw_line.decode("utf-8").strip()
                if line.startswith("REMARK"):
                    break
                if line.startswith("HEADER"):
                    # Classification occupies columns 11-50 (indices 10..49).
                    metadata["classification"] = line[10:50].strip().capitalize()
                    metadata["pdb"] = line[62:].strip()
                    # The two-digit year is resolved by strptime's %y rule.
                    deposited = datetime.datetime.strptime(line[50:59], "%d-%b-%y")
                    metadata["deposition_date"] = deposited.strftime("%Y-%m-%d")
                elif line.startswith("TITLE"):
                    # Join continuation lines first and capitalize once, so
                    # words on later lines do not get a stray capital.
                    part = line[10:].strip()
                    if part:
                        title_parts.append(part)
                    metadata["title"] = " ".join(title_parts).capitalize()
                elif line.startswith("KEYWDS"):
                    # NOTE: a keyword wrapped across two lines without a comma
                    # still ends up as two separate entries.
                    metadata.setdefault("Keywords", []).extend(
                        self._split_list(line[10:])
                    )
                elif line.startswith("AUTHOR"):
                    metadata.setdefault("authors", []).extend(
                        self._split_list(line[10:])
                    )
                elif line.startswith("JRNL"):
                    # Match the sub-record name (columns 13-16) instead of
                    # searching the whole line: an author called "DOI" on an
                    # AUTH line must not be taken for a DOI record.
                    sub_record = line[12:16].strip()
                    if sub_record == "PMID":
                        metadata["pmid"] = line[16:].strip()
                    elif sub_record == "DOI":
                        metadata["doi"] = line[16:].strip()
        return json.dumps(metadata)

    def extract_metadata(self, input_path="download", output_path="metadata",
                         chunk_size=10000):
        """Write the metadata of every ``*.gz`` file in a directory to dumps.

        ``output_path`` is deleted if it already exists and then recreated.
        Records are written one JSON document per line into
        ``dump_0<part>.gz`` files inside ``output_path``. A new part is
        started before the record whose running count is a multiple of
        ``chunk_size`` is written, so the first part holds ``chunk_size - 1``
        records.

        Args:
            input_path: Directory containing the gzip-compressed PDB files.
            output_path: Directory that receives the dump files.
            chunk_size: Running-count interval at which a new dump file is
                started.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        if os.path.isdir(output_path):
            shutil.rmtree(output_path)
        os.makedirs(output_path)

        part = 0
        writer = gzip.open(self._part_path(output_path, part), "w")
        try:
            # Sorted so the content of the dumps is reproducible across runs.
            sources = sorted(glob.glob(os.path.join(input_path, "*.gz")))
            for count, item in enumerate(sources, start=1):
                if count % chunk_size == 0:
                    part += 1
                    writer.close()
                    print(f"PARSED {count}")
                    # Rotated parts must land in output_path as well, not in
                    # the current working directory.
                    writer = gzip.open(self._part_path(output_path, part), "w")
                record = self.get_metadata(item) + "\n"
                writer.write(record.encode("utf-8"))
        finally:
            # Closing writes the gzip trailer; without it the last part may
            # be unreadable.
            writer.close()