69 lines
2.6 KiB
Python
69 lines
2.6 KiB
Python
import gzip
|
|
import json
|
|
import glob
|
|
import datetime
|
|
import os
|
|
import shutil
|
|
|
|
class MetadataExctractor:
    """Extract summary metadata from gzip-compressed PDB-format text files.

    ``get_metadata`` parses the leading records of one file (HEADER, TITLE,
    KEYWDS, AUTHOR, JRNL) and returns them as a JSON string.
    ``extract_metadata`` runs that over every ``*.gz`` file in a directory
    and writes the results as gzip-compressed JSON-lines dump files.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _split_list(text):
        """Split a comma-separated field into stripped, capitalized items.

        Empty items (e.g. from a trailing comma before a continuation line)
        are dropped.
        """
        return [
            item.strip().capitalize()
            for item in text.split(",")
            if item.strip()
        ]

    def get_metadata(self, path) -> str:
        """Parse the header records of one gzip-compressed PDB file.

        Reading stops at the first ``REMARK`` record; nothing after it is
        inspected.

        Args:
            path: Path of the gzip-compressed file to read.

        Returns:
            A JSON object string. Keys are only present when the matching
            record was found: ``classification``, ``pdb``,
            ``deposition_date`` (``YYYY-MM-DD``), ``title``, ``Keywords``
            (list), ``authors`` (list), ``pmid`` and ``doi``.

        Raises:
            ValueError: If a HEADER record's date field does not match
                ``DD-MON-YY``.
        """
        record = {}
        with gzip.open(path, "rt", encoding="utf-8") as handle:
            for raw_line in handle:
                line = raw_line.strip()

                if line.startswith("REMARK"):
                    # Everything of interest precedes the first REMARK.
                    break

                if line.startswith("HEADER"):
                    # Fixed columns (0-based slices): classification 10-49,
                    # deposition date 50-58, identifier from 62.
                    record['classification'] = line[10:50].strip().capitalize()
                    record['pdb'] = line[62:].strip()
                    deposited = datetime.datetime.strptime(
                        line[50:59], "%d-%b-%y"
                    )
                    record['deposition_date'] = deposited.strftime("%Y-%m-%d")
                elif line.startswith("TITLE"):
                    # Each physical line is capitalized on its own and the
                    # pieces are joined with single spaces.
                    piece = line[10:].strip().capitalize()
                    if "title" in record:
                        record['title'] = f"{record['title']} {piece}"
                    else:
                        record['title'] = piece
                elif line.startswith("KEYWDS"):
                    # The list starts at index 10; on continuation lines the
                    # continuation number sits before that index.
                    record.setdefault('Keywords', []).extend(
                        self._split_list(line[10:])
                    )
                elif line.startswith("AUTHOR"):
                    record.setdefault('authors', []).extend(
                        self._split_list(line[10:])
                    )
                elif line.startswith("JRNL"):
                    # Match on the sub-record tag rather than a substring
                    # search, so e.g. an author called "M.DOI" in a
                    # "JRNL AUTH" line is not mistaken for a DOI record.
                    fields = line.split(None, 2)
                    if len(fields) == 3:
                        tag, value = fields[1], fields[2].strip()
                        if tag == "PMID":
                            record['pmid'] = value
                        elif tag == "DOI":
                            record['doi'] = value
        return json.dumps(record)

    def extract_metadata(
        self,
        input_path: str = "download",
        output_path: str = "pdb_metadata",
        batch_size: int = 10000,
    ) -> None:
        """Write metadata for every ``*.gz`` file in a directory to dumps.

        ``output_path`` is removed if it already exists as a directory and
        is then recreated. Records are written one JSON object per line to
        ``dump_0<part>.gz`` files inside ``output_path``; a new part is
        started after every ``batch_size`` records. ``dump_00.gz`` is always
        created, even when there are no input files. Input files are
        processed in sorted name order so the output is reproducible.

        Args:
            input_path: Directory containing the ``*.gz`` input files.
            output_path: Directory that receives the dump files.
            batch_size: Maximum number of records per dump file.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        if os.path.isdir(output_path):
            shutil.rmtree(output_path)
        os.makedirs(output_path)

        part = 0
        total = 0
        in_part = 0
        writer = gzip.open(os.path.join(output_path, f"dump_0{part}.gz"), "w")
        try:
            for item in sorted(glob.glob(f"{input_path}/*.gz")):
                if in_part == batch_size:
                    # Current part is full: finish it and open the next one
                    # inside output_path (not the working directory).
                    writer.close()
                    print(f"PARSED {total}")
                    part += 1
                    in_part = 0
                    writer = gzip.open(
                        os.path.join(output_path, f"dump_0{part}.gz"), "w"
                    )
                writer.write(self.get_metadata(item).encode("utf-8"))
                writer.write("\n".encode("utf-8"))
                in_part += 1
                total += 1
        finally:
            # Closing writes the gzip trailer; without it the last dump file
            # may be unreadable.
            writer.close()
|