Compare commits
2 Commits
1cb03ea7a0
...
7decdade98
Author | SHA1 | Date |
---|---|---|
Sandro La Bruzzo | 7decdade98 | |
Sandro La Bruzzo | a9712c63d1 |
|
@ -10,6 +10,11 @@ __pycache__/
|
|||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
download
|
||||
metadata
|
||||
uniprot_metadata
|
||||
uniprot_sprot.dat.gz
|
||||
dump*.gz
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
from pdb.pdb_download import PDBDownloader
|
||||
from pdb.pdb_metadata_extractor import MetadataExctractor
|
||||
from pdb.pdb_validator import validate
|
||||
from uniprot.download import UniprotSwissDownloader
|
||||
from uniprot.metadata import MetadataExctractor as ME
|
||||
|
||||
if __name__ == '__main__':
    # Uniprot pipeline: the (slow) download step is intentionally skipped;
    # metadata is re-extracted from an already present uniprot_sprot.dat.gz.
    uniprot_downloader = UniprotSwissDownloader()
    # uniprot_downloader.download()
    extractor = ME()
    extractor.extract_metadata()

    # PDB pipeline (currently disabled):
    # p = PDBDownloader()
    # p.get_file_to_downloads(max_item=4)
    # m = MetadataExctractor()
    # m.extract_metadata()
    # error, valid, total, error_record = validate()
    # print(error)
    # print(f"Valid {valid}/{total}")
    # print(f"Error {error_record}/{total}")
|
||||
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
import gzip
|
||||
import json
|
||||
import os
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import requests
|
||||
import queue
|
||||
import threading
|
||||
import os
|
||||
import shutil
|
||||
|
||||
regex = r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}"
|
||||
|
||||
|
||||
def extract_row(soup, bp):
    """Collect file rows from an Apache-style directory-listing page.

    Scans every ``<tr>`` of *soup*; for each row it records the absolute link
    (base path *bp* joined with the anchor's ``href``) and the modification
    date. Rows without a date cell (e.g. the "Parent Directory" header row)
    are skipped.

    Args:
        soup: BeautifulSoup document of the listing page.
        bp: base URL prefix prepended to each row's relative ``href``.

    Returns:
        list[dict]: one ``{'links': ..., 'date': ...}`` dict per file row.
    """
    rows = []
    for row in soup.find_all('tr'):
        item = {}
        for td in row.find_all('td'):
            # A cell with an anchor carries the file/folder link.
            if td.a:
                item['links'] = f"{bp}{td.a.get('href')}"
            # Date cells look like "2023-08-01 10:15" (see module `regex`).
            if re.match(regex, td.text):
                item['date'] = td.text.strip()
        # Keep only real entries; header/parent rows have no date cell.
        if 'date' in item:
            rows.append(item)
    return rows
|
||||
|
||||
|
||||
def worker(q):
    """Queue consumer: download every file listed under one directory entry.

    Pulls directory items from *q* until the "DONE" sentinel is seen; the
    sentinel is pushed back so the sibling worker threads terminate too.
    Each downloaded file lands in the local ``download`` folder.
    """
    while True:
        item = q.get()
        if item == "DONE":
            # Re-enqueue the sentinel for the other workers, then stop.
            q.put(item)
            break
        html_doc = requests.get(item['links']).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        for entry in extract_row(soup, item['links']):
            name = entry['links'].split("/")[-1]
            response = requests.get(entry["links"])
            print(f"DOWNLOADING {entry['links']} into download/{name}")
            with open(os.path.join("download", name), 'wb') as fd:
                fd.write(response.content)
|
||||
|
||||
|
||||
class PDBDownloader:
    """
    PDB Downloader is a class that tries to download all the pdbs into a local download folder.
    It uses ftp.ebi.ac.uk but the https website since the ftp not works.
    """

    def __init__(self, basePath="https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/", number_of_thread=5) -> None:
        self.base_path = basePath
        # Work queue shared between this producer and the worker threads.
        self.download_list = queue.Queue()
        self.number_of_thread = number_of_thread

    def get_file_to_downloads(self, max_item=-1):
        """This method runs in parallel the download of all the folders under
        pub/databases/pdb/data/structures/divided/pdb/

        Args:
            max_item: stop after enqueueing this many folders (-1 = no limit).
        """
        # Start from a clean download folder; tolerate it not existing yet
        # (the original rmtree crashed on a fresh checkout).
        if os.path.isdir("download"):
            shutil.rmtree("download")
        os.mkdir("download")

        total_item = 0
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        for item in data:
            # BUGFIX: enforce the limit BEFORE enqueueing; the original put
            # the item first, so max_item=4 actually enqueued 5 items.
            if max_item > 0 and total_item >= max_item:
                break
            self.download_list.put(item)
            total_item += 1
        self.download_list.put("DONE")

        workers = [threading.Thread(target=worker, args=[self.download_list], daemon=True)
                   for _ in range(self.number_of_thread)]
        # BUGFIX: start ALL threads first, then join them all. The original
        # joined each thread right after starting it, which serialized the
        # downloads and defeated the thread pool entirely.
        for t in workers:
            t.start()
        for t in workers:
            t.join()

    def get_snapshot(self):
        """Crawl two directory levels starting at base_path and persist every
        discovered row to ``snapshot.gz`` as newline-delimited JSON."""
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        snapshots = []

        for item in data:
            snapshots.append(item)
            print(f"get info for {item['links']}")
            l_doc = requests.get(item['links']).text
            s = BeautifulSoup(l_doc, 'html.parser')
            snapshots += extract_row(s, item['links'])

        with gzip.open("snapshot.gz", "w") as w:
            for item in snapshots:
                w.write(json.dumps(item).encode())
                w.write("\n".encode())
|
|
@ -0,0 +1,68 @@
|
|||
import gzip
|
||||
import json
|
||||
import glob
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
|
||||
class MetadataExctractor:
    """Extract bibliographic metadata from gzipped PDB entry files."""

    def __init__(self) -> None:
        pass

    def get_metadata(self, path):
        """Parse one gzipped PDB file and return its metadata as a JSON string.

        Reads the fixed-column header section (HEADER, TITLE, KEYWDS, AUTHOR,
        JRNL records) and stops at the first REMARK record.

        Args:
            path: path to a ``*.gz`` PDB entry file.

        Returns:
            str: JSON object with classification, pdb id, deposition_date,
            title, Keywords, authors and (when present) pmid/doi.
        """
        with gzip.open(path) as f:
            p = {}
            for line in f:
                l = line.decode("utf-8").strip()
                if l.startswith("HEADER"):
                    # Fixed PDB columns: classification, deposition date, id.
                    p['classification'] = l[10:49].strip().capitalize()
                    p['pdb'] = l[62:].strip()
                    p['deposition_date'] = datetime.datetime.strptime(l[50:59], "%d-%b-%y").strftime("%Y-%m-%d")
                elif l.startswith("REMARK"):
                    # Header section is over: nothing else to harvest.
                    # (The original had a second, unreachable REMARK check
                    # further down; it has been removed.)
                    break
                elif l.startswith("TITLE"):
                    if "title" in p:
                        # Continuation line: append to the running title.
                        p['title'] = f"{p['title']} {l[10:].strip().capitalize()}"
                    else:
                        p['title'] = l[10:].strip().capitalize()
                elif l.startswith("KEYWDS"):
                    # BUGFIX: strip each keyword on the first line too — the
                    # original only stripped on continuation lines, leaving
                    # leading spaces (and mis-capitalized words) in Keywords.
                    kws = [s.strip().capitalize() for s in l[11:].split(",")]
                    p['Keywords'] = p.get('Keywords', []) + kws
                elif l.startswith("AUTHOR"):
                    authors = [s.strip().capitalize() for s in l[10:].split(",")]
                    p['authors'] = p.get('authors', []) + authors
                elif l.startswith("JRNL"):
                    if 'PMID' in l:
                        p['pmid'] = l[l.find("PMID") + 5:].strip()
                    if 'DOI' in l:
                        p['doi'] = l[l.find("DOI") + 5:].strip()
            return json.dumps(p)

    def extract_metadata(self, input_path="download", output_path="metadata"):
        """Parse every ``*.gz`` PDB file in *input_path* and write the records
        as newline-delimited JSON into rotating gzip dumps under
        *output_path* (a new part every 10000 files)."""
        if (os.path.exists(output_path) and os.path.isdir(output_path)):
            shutil.rmtree(output_path)
        os.mkdir(output_path)
        part = 0
        i = 0
        w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        try:
            for item in glob.glob(f"{input_path}/*.gz"):
                i += 1
                if i % 10000 == 0:
                    part += 1
                    w.flush()
                    w.close()
                    print(f"PARSED {i}")
                    # BUGFIX: rotated parts were opened as "dump_0{part}.gz"
                    # in the CWD — the output_path prefix was missing.
                    w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
                w.write(self.get_metadata(item).encode("utf-8"))
                w.write("\n".encode("utf-8"))
        finally:
            # BUGFIX: the last (possibly partial) dump was never closed.
            w.close()
|
|
@ -0,0 +1,39 @@
|
|||
import json
|
||||
import gzip
|
||||
import glob
|
||||
|
||||
|
||||
|
||||
def validate(input_path="metadata"):
    """Validate the JSON records stored in every ``dump*.gz`` under *input_path*.

    Each record is checked for a deposition date, at least one external link
    (pmid or doi), a non-empty author list, a title, keywords, and a
    4-character pdb id.

    Returns:
        tuple: (error counters dict, valid count, total count, error count)
    """
    # (label, predicate, fatal) — MissingKeywords is tallied but deliberately
    # does NOT mark the record as invalid, matching the original behaviour.
    checks = (
        ('MissingDate', lambda r: "deposition_date" not in r or r['deposition_date'] is None, True),
        ('MissingLinks', lambda r: 'pmid' not in r and 'doi' not in r, True),
        ('MissingAuthor', lambda r: 'authors' not in r or len(r['authors']) < 1, True),
        ('MissingTitle', lambda r: "title" not in r or r['title'] is None, True),
        ('MissingKeywords', lambda r: 'Keywords' not in r or r['Keywords'] is None, False),
        ('WRONGPDB', lambda r: 'pdb' not in r or len(r['pdb']) != 4, True),
    )

    error = {}
    valid = 0
    total = 0
    error_record = 0
    for dump in glob.glob(f"{input_path}/dump*.gz"):
        with gzip.open(dump) as fh:
            for raw in fh:
                record = json.loads(raw.decode("utf-8"))
                bad = False
                for label, failed, fatal in checks:
                    if failed(record):
                        error[label] = error.get(label, 0) + 1
                        bad = bad or fatal
                total += 1
                if bad:
                    error_record += 1
                else:
                    valid += 1
    return (error, valid, total, error_record)
|
|
@ -0,0 +1,8 @@
|
|||
beautifulsoup4==4.12.2
|
||||
bs4==0.0.1
|
||||
certifi==2023.7.22
|
||||
charset-normalizer==3.2.0
|
||||
idna==3.4
|
||||
requests==2.31.0
|
||||
soupsieve==2.5
|
||||
urllib3==2.0.4
|
|
@ -0,0 +1,24 @@
|
|||
import requests
|
||||
import sys
|
||||
class UniprotSwissDownloader():
    """Download the Swiss-Prot flat file (uniprot_sprot.dat.gz) from EBI,
    streaming it to disk with a textual progress bar."""

    def __init__(self, url="https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz") -> None:
        # Source URL; overridable for mirrors or tests.
        self.url = url

    def download(self):
        """Stream the archive into ./uniprot_sprot.dat.gz.

        When the server reports a Content-Length, a 50-character progress bar
        is rendered on stdout; otherwise the whole body is written in one shot.
        """
        # BUGFIX: close the streamed response when done — the original leaked
        # the connection. requests.Response is a context manager.
        with requests.get(self.url, stream=True) as r:
            total_length = r.headers.get('content-length')

            with open("uniprot_sprot.dat.gz", 'wb') as f:
                if total_length is None:  # no content length header
                    f.write(r.content)
                else:
                    dl = 0
                    total_length = int(total_length)
                    for data in r.iter_content(chunk_size=4096):
                        dl += len(data)
                        f.write(data)
                        done = int(50 * dl / total_length)
                        sys.stdout.write("\rDownloading[%s%s]" % ('=' * done, ' ' * (50-done)) )
                        sys.stdout.flush()
|
|
@ -0,0 +1,75 @@
|
|||
import gzip
|
||||
import json
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
import re
|
||||
|
||||
regex = r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
|
||||
|
||||
class MetadataExctractor:
    """Stream metadata records out of a gzipped Swiss-Prot flat file."""

    def __init__(self, path="uniprot_sprot.dat.gz") -> None:
        print(f"Open Path {path}")
        # Kept open for the lifetime of the extractor; get_metadata()
        # consumes it lazily.
        self.current_file = gzip.open(path)

    def get_metadata(self):
        """Yield one JSON string per Swiss-Prot entry (entries end with //).

        Parses AC, DT, DE (RecName), OS, OC and RX line types into a dict.
        """
        p = {}
        for line in self.current_file:
            l = line.decode("utf-8").strip()
            if l.startswith("AC"):
                p['pid'] = l[4:-1]
            elif l.startswith("DT "):
                cd = {}
                k = l[4:-1].split(',')
                cd['date'] = datetime.datetime.strptime(k[0].strip(), "%d-%b-%Y").strftime("%Y-%m-%d")
                cd['date_info'] = k[1].strip()
                dates = p.get("dates", [])
                dates.append(cd)
                p['dates'] = dates
            elif l.startswith("DE "):
                k = l[4:-1]
                if 'RecName: Full=' in k:
                    p['title'] = k[k.find('=') + 1:]
            elif l.startswith("OS "):
                p['organism_species'] = l[4:-1].strip()
            elif l.startswith("OC "):
                s = l[4:-1].strip().split(';')
                subjects = p.get('subjects', [])
                for sub in s:
                    subjects.append(sub.strip())
                p['subjects'] = subjects
            elif l.startswith("RX "):
                references = [c.strip() for c in l[4:-1].strip().split(';')]
                relations = []
                for r in references:
                    dd = r.split("=")
                    if len(dd) == 2 and dd[0].lower().strip() in ['pubmed', 'doi']:
                        relations.append({dd[0]: dd[1]})
                if len(relations):
                    p["references"] = relations
            elif l.startswith("//"):
                yield json.dumps(p)
                # BUGFIX: reset the accumulator — the original never cleared
                # it, so every record inherited stale fields from all the
                # entries parsed before it.
                p = {}

    def extract_metadata(self, output_path="uniprot_metadata"):
        """Write all records as newline-delimited JSON into rotating gzip
        dumps under *output_path* (a new part every 10000 records)."""
        if (os.path.exists(output_path) and os.path.isdir(output_path)):
            shutil.rmtree(output_path)
        os.mkdir(output_path)
        part = 0
        i = 0
        w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        try:
            for item in self.get_metadata():
                i += 1
                w.write(item.encode())
                w.write("\n".encode())
                if i % 10000 == 0:
                    part += 1
                    w.flush()
                    w.close()
                    print(f"PARSED {i}")
                    # BUGFIX: rotated parts were opened in the CWD — the
                    # output_path prefix was missing.
                    w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        finally:
            # BUGFIX: close the final (possibly partial) dump file.
            w.close()
|
||||
|
Loading…
Reference in New Issue