imported pdb preprocessing scripts

2023-09-20 15:16:12 +02:00 · 2023-09-20 15:16:12 +02:00 · a9712c63d1
parent 1cb03ea7a0
commit a9712c63d1
7 changed files with 234 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,6 +10,8 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
 download
 metadata
 develop-eggs/
 dist/
 downloads/
--- a/main.py
+++ b/main.py
@ -0,0 +1,14 @@
 from pdb.pdb_download import PDBDownloader
 from pdb.pdb_metadata_extractor import MetadataExctractor
 from pdb.pdb_validator import validate
 if __name__ == '__main__':
    # Stage 1: fetch a small sample of PDB folders into the local download folder.
    PDBDownloader().get_file_to_downloads(max_item=4)

    # Stage 2: turn the downloaded archives into gzipped JSON-lines metadata dumps.
    extractor = MetadataExctractor()
    extractor.extract_metadata()

    # Stage 3: validate the dumps and report the per-category and overall counts.
    report = validate()
    error, valid, total, error_record = report
    print(error)
    print(f"Valid {valid}/{total}")
    print(f"Error {error_record}/{total}")
--- a/pdb/init.py
+++ b/pdb/init.py
--- a/pdb/pdb_download.py
+++ b/pdb/pdb_download.py
@ -0,0 +1,103 @@
 import gzip
 import json
 import os
 from bs4 import BeautifulSoup
 import re
 import requests
 import queue
 import threading
 import os
 import shutil
 # Timestamp as shown in the listing rows, e.g. "2023-09-20 15:16".
 regex = r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}"
 def extract_row(soup, bp):
    """Collect the dated link entries of an HTML directory listing.

    Every ``<tr>`` of ``soup`` is scanned cell by cell. A cell that contains an
    anchor with an ``href`` contributes the link (``bp`` + ``href``); a cell
    whose text starts with a ``YYYY-MM-DD HH:MM`` timestamp contributes the
    date. If several cells of a row qualify, the last one wins.

    Args:
        soup: parsed document exposing ``find_all('tr')``; each row exposes
            ``find_all('td')`` and each cell exposes ``.a`` and ``.text``.
        bp: base URL that the (relative) ``href`` values are appended to.

    Returns:
        A list of ``{'links': ..., 'date': ...}`` dicts, one per row that has
        both a link and a date. Rows missing either are skipped, so callers
        can index ``entry['links']`` without risking a ``KeyError``.
    """
    rows = []
    for row in soup.find_all('tr'):
        entry = {}
        for td in row.find_all('td'):
            anchor = td.a
            href = anchor.get('href') if anchor is not None else None
            if href:
                entry['links'] = f"{bp}{href}"
            if re.match(regex, td.text):
                entry['date'] = td.text.strip()
        # Header, separator and parent-directory rows lack one of the two fields.
        if 'links' in entry and 'date' in entry:
            rows.append(entry)
    return rows
 def worker(q, timeout=60):
    """Consume folder entries from ``q`` and download every file they list.

    Each queue item is a dict whose ``'links'`` value is the URL of a folder
    listing. The listing is fetched and parsed with ``extract_row``, and every
    listed file is saved into the local ``download`` folder under the last
    path segment of its URL. The string ``"DONE"`` is the stop sentinel: it is
    put back on the queue before returning so that the other workers sharing
    the queue stop as well.

    HTTP and network failures are reported on stdout and skipped, so one bad
    folder or file neither kills the thread nor leaves an error page on disk
    disguised as an archive.

    Args:
        q: ``queue.Queue`` shared by all workers.
        timeout: seconds to wait for each HTTP request before giving up.
    """
    while True:
        item = q.get()
        if item == "DONE":
            # Re-queue the sentinel for the sibling workers, then stop.
            q.put(item)
            break
        try:
            listing = requests.get(item['links'], timeout=timeout)
            listing.raise_for_status()
        except requests.RequestException as exc:
            print(f"FAILED to list {item['links']}: {exc}")
            continue
        soup = BeautifulSoup(listing.text, 'html.parser')
        for f in extract_row(soup, item['links']):
            url = f.get('links')
            name = url.split("/")[-1] if url else ""
            if not name:
                # No link, or a sub-folder link ending in "/": nothing to save.
                continue
            print(f"DOWNLOADING {url} into download/{name}")
            try:
                r = requests.get(url, timeout=timeout)
                r.raise_for_status()
            except requests.RequestException as exc:
                print(f"FAILED to download {url}: {exc}")
                continue
            with open(os.path.join("download", name), 'wb') as fd:
                fd.write(r.content)
 class PDBDownloader:
    """
        PDB Downloader is a class that tries to download all the PDB files into a local download folder.
        It reads the HTTPS listing of ftp.ebi.ac.uk instead of the FTP endpoint, since the FTP one does not work.
    """
    def __init__(self, basePath="https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/", number_of_thread=5) -> None:
        """Store the listing root URL and the number of download threads."""
        self.base_path = basePath
        self.download_list = queue.Queue()
        self.number_of_thread = number_of_thread
    def _enqueue_items(self, data, max_item=-1):
        """Queue at most ``max_item`` entries of ``data`` followed by the "DONE" sentinel.

        A non-positive ``max_item`` means "no limit". Returns the number of
        entries queued (the sentinel is not counted).
        """
        total_item = 0
        for item in data:
            if max_item > 0 and total_item >= max_item:
                break
            self.download_list.put(item)
            total_item += 1
        self.download_list.put("DONE")
        return total_item
    def get_file_to_downloads(self, max_item =-1):
        """ Download, in parallel, the folders listed under
            pub/databases/pdb/data/structures/divided/pdb/

            The local "download" folder is emptied first (and created when it
            does not exist yet). ``max_item`` limits the number of folders that
            are processed; a non-positive value means all of them.
        """
        if os.path.isdir("download"):
            shutil.rmtree("download")
        os.makedirs("download", exist_ok=True)
        # Start from an empty queue so a sentinel left by a previous run cannot
        # stop the workers before they have done anything.
        self.download_list = queue.Queue()
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        self._enqueue_items(extract_row(soup, self.base_path), max_item)
        workers = [threading.Thread(target=worker, args=[self.download_list], daemon=True) for _ in range(self.number_of_thread)]
        # Start every thread before waiting on any of them; joining right after
        # each start would run the downloads one thread at a time.
        for thread in workers:
            thread.start()
        for thread in workers:
            thread.join()
    def get_snapshot(self):
        """Write the root listing and every folder listing to "snapshot.gz".

        The output is a gzip file with one JSON object per line: each root
        entry is followed by the entries found in the folder it links to.
        """
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        snapshots = []
        for item in data:
            snapshots.append(item)
            print(f"get info for {item['links']}")
            l_doc = requests.get(item['links']).text
            s = BeautifulSoup(l_doc, 'html.parser')
            snapshots.extend(extract_row(s, item['links']))
        with gzip.open("snapshot.gz", "w") as w:
            for item in snapshots:
                w.write(json.dumps(item).encode())
                w.write("\n".encode())
--- a/pdb/pdb_metadata_extractor.py
+++ b/pdb/pdb_metadata_extractor.py
@ -0,0 +1,68 @@
 import gzip
 import json
 import glob
 import datetime
 import os
 import shutil
 class MetadataExctractor:
    """Extract header metadata from gzipped PDB files into JSON-lines dumps."""
    def __init__(self) -> None:
        pass
    def get_metadata(self, path) :
        """Parse the records that precede the first REMARK of one gzipped PDB file.

        Fixed-column slices follow the PDB flat-file layout: the payload of
        TITLE/KEYWDS/AUTHOR starts at column 11 (index 10); JRNL sub-record
        names sit in columns 13-16 and their values start at column 20.

        Args:
            path: path of a gzip-compressed PDB text file.

        Returns:
            A JSON string with the keys found among ``classification``, ``pdb``,
            ``deposition_date`` (``YYYY-MM-DD``), ``title``, ``Keywords``,
            ``authors``, ``pmid`` and ``doi``. ``deposition_date`` is omitted
            when the HEADER date cannot be parsed.
        """
        def split_list(text):
            # Comma-separated values; a trailing comma on a continued line
            # must not yield an empty entry.
            return [s.strip().capitalize() for s in text.split(",") if s.strip()]

        p = {}
        with gzip.open(path) as f:
            for line in f:
                l = line.decode("utf-8").strip()
                if l.startswith("REMARK"):
                    # Everything wanted here comes before the REMARK section.
                    break
                if l.startswith("HEADER"):
                    p['classification'] = l[10:50].strip().capitalize()
                    p['pdb'] = l[62:].strip()
                    try:
                        p['deposition_date'] = datetime.datetime.strptime(l[50:59], "%d-%b-%y").strftime("%Y-%m-%d")
                    except ValueError:
                        # Leave the key out; the validator reports it as missing.
                        pass
                elif l.startswith("TITLE"):
                    segment = l[10:].strip().capitalize()
                    if "title" in p:
                        p['title'] = f"{p['title']} {segment}"
                    else:
                        p['title'] = segment
                elif l.startswith("KEYWDS"):
                    p.setdefault('Keywords', []).extend(split_list(l[10:]))
                elif l.startswith("AUTHOR"):
                    p.setdefault('authors', []).extend(split_list(l[10:]))
                elif l.startswith("JRNL"):
                    # Match the sub-record name itself, not any occurrence of
                    # "PMID"/"DOI" in the line (e.g. an author named DOIG).
                    field = l[12:16].strip()
                    value = l[19:].strip()
                    if field == "PMID":
                        p['pmid'] = value
                    elif field == "DOI":
                        # A long DOI may continue on a following DOI line.
                        p['doi'] = p.get('doi', '') + value
        return json.dumps(p)
    def extract_metadata(self, input_path="download", output_path="metadata", chunk_size=10000):
        """Write the metadata of every ``*.gz`` file in ``input_path`` as JSON lines.

        ``output_path`` is deleted if it is an existing directory and then
        recreated. Records are written to ``dump_00.gz``, ``dump_01.gz``, ...
        inside it, starting a new part every ``chunk_size`` records. Input
        files are processed in sorted name order.

        Raises:
            ValueError: if ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")
        if os.path.isdir(output_path):
            shutil.rmtree(output_path)
        os.mkdir(output_path)
        part = 0
        w = gzip.open(os.path.join(output_path, f"dump_0{part}.gz"), "w")
        try:
            for i, item in enumerate(sorted(glob.glob(os.path.join(input_path, "*.gz")))):
                if i and i % chunk_size == 0:
                    w.close()
                    print(f"PARSED {i}")
                    part += 1
                    # Later parts belong in output_path too, not the working directory.
                    w = gzip.open(os.path.join(output_path, f"dump_0{part}.gz"), "w")
                w.write(self.get_metadata(item).encode("utf-8"))
                w.write("\n".encode("utf-8"))
        finally:
            # Closing writes the gzip trailer; without it the last part is truncated.
            w.close()
--- a/pdb/pdb_validator.py
+++ b/pdb/pdb_validator.py
@ -0,0 +1,39 @@
 import json
 import gzip
 import glob
 def validate(input_path="metadata") :
    """Check every record of the ``dump*.gz`` files found in ``input_path``.

    Each line of each dump is decoded as one JSON record and run through a
    fixed list of rules. Every rule that fires increments its counter; all
    rules except ``MissingKeywords`` also mark the record as erroneous.

    Returns:
        A tuple ``(error, valid, total, error_record)``: the per-rule counters
        (only rules that fired at least once appear), the number of records
        without a fatal problem, the number of records read, and the number of
        records with at least one fatal problem.
    """
    # (counter name, marks the record as erroneous, predicate "rule is violated")
    rules = (
        ('MissingDate', True,
         lambda rec: not ("deposition_date" in rec and rec['deposition_date'] is not None)),
        ('MissingLinks', True,
         lambda rec: not ('pmid' in rec or 'doi' in rec)),
        ('MissingAuthor', True,
         lambda rec: not ('authors' in rec and len(rec['authors']) >= 1)),
        ('MissingTitle', True,
         lambda rec: not ("title" in rec and rec['title'] is not None)),
        ('MissingKeywords', False,
         lambda rec: not ('Keywords' in rec and rec['Keywords'] is not None)),
        ('WRONGPDB', True,
         lambda rec: not ('pdb' in rec and len(rec['pdb']) == 4)),
    )
    tally = {}
    seen = 0
    rejected = 0
    for archive in glob.glob(f"{input_path}/dump*.gz"):
        with gzip.open(archive) as handle:
            for raw in handle:
                record = json.loads(raw.decode("utf-8"))
                fatal_hit = False
                for label, fatal, violated in rules:
                    if violated(record):
                        tally[label] = tally.get(label, 0) + 1
                        fatal_hit = fatal_hit or fatal
                seen += 1
                if fatal_hit:
                    rejected += 1
    return (tally, seen - rejected, seen, rejected)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,8 @@
 beautifulsoup4==4.12.2
 bs4==0.0.1
 certifi==2023.7.22
 charset-normalizer==3.2.0
 idna==3.4
 requests==2.31.0
 soupsieve==2.5
 urllib3==2.0.4