# Listing header left over from the code viewer: 75 lines, 2.6 KiB, Python
import gzip
import json
import datetime
import os
import shutil
import re

# Pattern matching UniProtKB-style accession numbers, e.g. "P12345", "Q6GZX4"
# or the 10-character form "A0A022YWF9".
# NOTE(review): nothing in the visible part of this file references `regex`
# (and it is not compiled) - confirm whether other code imports it before
# removing or changing it.
regex = r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"


class MetadataExctractor:
    """Extract one small metadata record per entry from a gzipped flat file.

    The input is read line by line.  Each line is expected to start with a
    two-character line code in columns 1-2 followed by blanks and the data;
    an entry ends with a line starting with ``//``.  The following line codes
    are used, all others are ignored:

    * ``AC`` -> ``pid``               (content of the line, last ``AC`` wins)
    * ``DT`` -> ``dates``             (list of ``{"date", "date_info"}``)
    * ``DE`` -> ``title``             (text after ``RecName: Full=``)
    * ``OS`` -> ``organism_species``  (last ``OS`` line wins)
    * ``OC`` -> ``subjects``          (``;``-separated items, accumulated)
    * ``RX`` -> ``references``        (``PubMed`` / ``DOI`` ids, accumulated)

    The instance can be used as a context manager so the input file is
    closed deterministically::

        with MetadataExctractor("uniprot_sprot.dat.gz") as extractor:
            extractor.extract_metadata("uniprot_metadata")
    """

    def __init__(self, path="uniprot_sprot.dat.gz") -> None:
        """Open the gzip-compressed input file at *path*.

        Raises:
            OSError: if *path* cannot be opened.
        """
        print(f"Open Path {path}")
        # Binary mode on purpose: get_metadata() decodes every line itself.
        self.current_file = gzip.open(path)

    def close(self) -> None:
        """Close the underlying input file."""
        self.current_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    @staticmethod
    def _parse_date(body):
        """Turn ``"28-JUN-2011, some text"`` into a ``date``/``date_info`` dict.

        Raises:
            ValueError: if the part before the first comma is not a
                ``DD-MON-YYYY`` date.
        """
        parts = body.split(",")
        parsed = datetime.datetime.strptime(parts[0].strip(), "%d-%b-%Y")
        # A line without a comma simply has no description.
        info = parts[1].strip() if len(parts) > 1 else ""
        return {"date": parsed.strftime("%Y-%m-%d"), "date_info": info}

    @staticmethod
    def _parse_references(body):
        """Return ``[{key: value}, ...]`` for the PubMed/DOI items of an RX line."""
        relations = []
        for item in body.split(";"):
            # partition() splits on the first "=" only, so identifiers that
            # themselves contain "=" are kept instead of being dropped.
            key, separator, value = item.strip().partition("=")
            if separator and key.lower().strip() in ("pubmed", "doi"):
                relations.append({key: value})
        return relations

    def get_metadata(self):
        """Yield one JSON string per entry found in the input file.

        Iterates ``self.current_file`` (an iterable of UTF-8 encoded ``bytes``
        lines) and yields ``json.dumps(record)`` each time an entry terminator
        (``//``) is reached.  Every entry starts from an empty record, so no
        field is carried over from the previous entry.  Lines after the last
        terminator are not emitted.

        Raises:
            ValueError: if a ``DT`` line does not start with a valid date.
            UnicodeDecodeError: if a line is not valid UTF-8.
        """
        record = {}
        for raw_line in self.current_file:
            # Only strip on the right: the line code must sit in columns 1-2,
            # otherwise indented sequence data such as "     ACDEF..." would
            # be mistaken for an "AC" line.
            text = raw_line.decode("utf-8").rstrip()

            if text.startswith("//"):
                yield json.dumps(record)
                record = {}  # start the next entry from scratch
                continue

            # Column 3 must be blank (or absent) on a coded line.
            if text[2:3].strip():
                continue

            code = text[:2]
            body = text[2:].strip()
            # Data lines are terminated by ";" or "."; drop that terminator.
            if body.endswith((";", ".")):
                body = body[:-1].strip()

            if code == "AC":
                record["pid"] = body
            elif code == "DT":
                record.setdefault("dates", []).append(self._parse_date(body))
            elif code == "DE":
                if "RecName: Full=" in body:
                    record["title"] = body[body.find("=") + 1:]
            elif code == "OS":
                record["organism_species"] = body
            elif code == "OC":
                record.setdefault("subjects", []).extend(
                    item.strip() for item in body.split(";") if item.strip()
                )
            elif code == "RX":
                relations = self._parse_references(body)
                if relations:
                    # Accumulate: an entry may carry several RX lines.
                    record.setdefault("references", []).extend(relations)

    def extract_metadata(self, output_path="uniprot_metadata", chunk_size=10000):
        """Write all records as gzipped JSON-lines part files in *output_path*.

        An existing directory at *output_path* is removed first.  Records are
        written one JSON document per line to ``dump_0<part>.gz``; a new part
        is started after every *chunk_size* records.

        Args:
            output_path: directory to (re)create and fill with part files.
            chunk_size: number of records per part file (must be positive).

        Raises:
            ValueError: if *chunk_size* is not positive.
            OSError: if the directory or a part file cannot be created
                (e.g. *output_path* exists but is not a directory).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        if os.path.isdir(output_path):
            shutil.rmtree(output_path)
        os.mkdir(output_path)

        def open_part(number):
            # Every part, not only the first, belongs inside output_path.
            return gzip.open(os.path.join(output_path, f"dump_0{number}.gz"), "w")

        part = 0
        count = 0
        writer = open_part(part)
        try:
            for item in self.get_metadata():
                count += 1
                writer.write(item.encode())
                writer.write(b"\n")
                if count % chunk_size == 0:
                    writer.close()
                    print(f"PARSED {count}")
                    part += 1
                    writer = open_part(part)
        finally:
            # Closing writes the gzip trailer; without it the last part
            # would be left truncated.
            writer.close()