# bioentities-preprocess/uniprot/metadata.py

import gzip
import json
import datetime
import os
import shutil
import re

# Identifier pattern with two alternatives:
#   * one of O/P/Q, a digit, three alphanumerics, a digit; or
#   * one of A-N/R-Z, a digit, then one or two groups of
#     (letter, two alphanumerics, digit).
# NOTE(review): presumably the UniProt accession-number format; it is not
# referenced anywhere in the visible part of this file.
regex = r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"

class MetadataExctractor:
    """Stream per-entry metadata out of a gzip-compressed flat file.

    The input is read line by line.  Lines are recognised by their two-letter
    line code in the first two columns (``AC``, ``DT``, ``DE``, ``OS``, ``OC``,
    ``RX``) and an entry ends at a line starting with ``//``.  Every other
    line is ignored.

    The instance can be used as a context manager so the input handle is
    closed deterministically::

        with MetadataExctractor("uniprot_sprot.dat.gz") as extractor:
            extractor.extract_metadata("out")
    """

    # A data line's payload ends with one of these terminator characters.
    _TERMINATORS = (";", ".")
    # Identifier kinds kept from ``RX`` lines (compared case-insensitively).
    _REFERENCE_KINDS = ("pubmed", "doi")

    def __init__(self, path="uniprot_sprot.dat.gz") -> None:
        """Open the gzip file at *path* for reading (binary mode)."""
        print(f"Open Path {path}")
        self.current_file = gzip.open(path)

    def close(self):
        """Close the underlying input file."""
        self.current_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @classmethod
    def _payload(cls, text):
        """Return the content of a data line without line code and terminator."""
        body = text[2:].strip()
        # Only drop the last character when it really is a terminator, so a
        # wrapped line that ends mid-sentence does not lose a letter.
        if body.endswith(cls._TERMINATORS):
            body = body[:-1]
        return body.strip()

    def get_metadata(self):
        """Yield one JSON string per entry found in the input file.

        Keys that may appear in each JSON object:

        * ``pid`` - payload of the ``AC`` line.
        * ``dates`` - list of ``{"date": "YYYY-MM-DD", "date_info": ...}``,
          one per ``DT`` line.
        * ``title`` - text after ``RecName: Full=`` on a ``DE`` line.
        * ``organism_species`` - payload of the ``OS`` line.
        * ``subjects`` - ``;``-separated items collected from all ``OC`` lines.
        * ``references`` - list of single-key dicts (``PubMed`` / ``DOI``)
          collected from all ``RX`` lines of the entry.

        Raises:
            ValueError: if a ``DT`` line does not start with a
                ``%d-%b-%Y`` date.
        """
        record = {}
        for raw in self.current_file:
            # Keep leading whitespace: the line code must sit in column 1,
            # otherwise indented sequence data such as "     ACDEF..." would
            # be mistaken for an "AC" line.
            text = raw.decode("utf-8").rstrip()

            if text.startswith("//"):
                yield json.dumps(record)
                # Start the next entry from scratch so nothing leaks over.
                record = {}
                continue

            if text[2:3] != " ":
                continue
            code = text[:2]
            payload = self._payload(text)

            if code == "AC":
                # NOTE(review): an entry with several AC lines keeps only the
                # last one, as before - confirm that is the intended id.
                record["pid"] = payload
            elif code == "DT":
                pieces = payload.split(",")
                stamp = datetime.datetime.strptime(pieces[0].strip(), "%d-%b-%Y")
                record.setdefault("dates", []).append(
                    {
                        "date": stamp.strftime("%Y-%m-%d"),
                        "date_info": pieces[1].strip() if len(pieces) > 1 else "",
                    }
                )
            elif code == "DE":
                if "RecName: Full=" in payload:
                    record["title"] = payload[payload.find("=") + 1:]
            elif code == "OS":
                # NOTE(review): a wrapped OS statement keeps only its last line.
                record["organism_species"] = payload
            elif code == "OC":
                record.setdefault("subjects", []).extend(
                    item.strip() for item in payload.split(";") if item.strip()
                )
            elif code == "RX":
                found = []
                for chunk in payload.split(";"):
                    pieces = chunk.strip().split("=")
                    if len(pieces) == 2 and pieces[0].lower().strip() in self._REFERENCE_KINDS:
                        found.append({pieces[0]: pieces[1]})
                if found:
                    # Accumulate across RX lines instead of keeping only the
                    # identifiers of the last citation.
                    record.setdefault("references", []).extend(found)

    def extract_metadata(self, output_path="uniprot_metadata", batch_size=10000):
        """Write all entries as gzip-compressed JSON lines under *output_path*.

        An existing directory at *output_path* is removed first.  Output is
        split into ``dump_0<N>.gz`` parts holding at most *batch_size* entries
        each; ``dump_00.gz`` is always created, even for empty input.

        Args:
            output_path: directory to (re)create and fill with part files.
            batch_size: maximum number of entries per part file.

        Raises:
            ValueError: if *batch_size* is not a positive integer.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        if os.path.exists(output_path) and os.path.isdir(output_path):
            shutil.rmtree(output_path)
        os.mkdir(output_path)

        part = 0
        count = 0
        writer = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        try:
            for item in self.get_metadata():
                # Rotate lazily, right before the entry that needs a new
                # part, so no empty trailing file is produced.
                if count and count % batch_size == 0:
                    writer.close()
                    part += 1
                    writer = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
                writer.write(item.encode())
                writer.write("\n".encode())
                count += 1
                if count % batch_size == 0:
                    print(f"PARSED {count}")
        finally:
            # Closing writes the gzip trailer; without it the last part can
            # be truncated or unreadable.
            writer.close()