diff --git a/.gitignore b/.gitignore
index 9a474f2..d452d39 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,8 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
+download
+metadata
 develop-eggs/
 dist/
 downloads/
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..92f8d70
--- /dev/null
+++ b/main.py
@@ -0,0 +1,14 @@
+from pdb.pdb_download import PDBDownloader
+from pdb.pdb_metadata_extractor import MetadataExctractor
+from pdb.pdb_validator import validate
+
+
+if __name__ == '__main__':
+    p = PDBDownloader()
+    p.get_file_to_downloads(max_item=4)
+    m = MetadataExctractor()
+    m.extract_metadata()
+    error, valid, total, error_record = validate()
+    print(error)
+    print(f"Valid {valid}/{total}")
+    print(f"Error {error_record}/{total}")
diff --git a/pdb/__init__.py b/pdb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pdb/pdb_download.py b/pdb/pdb_download.py
new file mode 100644
index 0000000..8ca26e3
--- /dev/null
+++ b/pdb/pdb_download.py
@@ -0,0 +1,103 @@
+import gzip
+import json
+import os
+import re
+import queue
+import threading
+import shutil
+
+import requests
+from bs4 import BeautifulSoup
+
+# Matches the "YYYY-MM-DD HH:MM" timestamps shown in the directory listing.
+regex = r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}"
+
+
+def extract_row(soup, bp):
+    """Extract the link/date pairs from the rows of an HTML directory listing."""
+    rows = []
+    for row in soup.find_all('tr'):
+        ci = {}
+        for td in row.find_all('td'):
+            if td.a:
+                ci['links'] = f"{bp}{td.a.get('href')}"
+            if re.match(regex, td.text):
+                ci['date'] = td.text.strip()
+        if 'date' in ci:
+            rows.append(ci)
+    return rows
+
+
+def worker(q):
+    """Consume directory entries from the queue and download every file they list."""
+    while True:
+        item = q.get()
+        if item == "DONE":
+            # Put the sentinel back so the other workers terminate too.
+            q.put(item)
+            break
+        html_doc = requests.get(item['links']).text
+        soup = BeautifulSoup(html_doc, 'html.parser')
+        data = extract_row(soup, item['links'])
+        for f in data:
+            name = f['links'].split("/")[-1]
+            r = requests.get(f["links"])
+            print(f"DOWNLOADING {f['links']} into download/{name}")
+            with open(os.path.join("download", name), 'wb') as fd:
+                fd.write(r.content)
+
+
+class PDBDownloader:
+    """
+    Download all the PDB entries into a local "download" folder.
+    It uses the ftp.ebi.ac.uk mirror, but over HTTPS, since the FTP endpoint does not work.
+    """
+    def __init__(self, basePath="https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/", number_of_thread=5) -> None:
+        self.base_path = basePath
+        self.download_list = queue.Queue()
+        self.number_of_thread = number_of_thread
+
+    def get_file_to_downloads(self, max_item=-1):
+        """Download all the folders under
+        pub/databases/pdb/data/structures/divided/pdb/ in parallel.
+        """
+        shutil.rmtree("download", ignore_errors=True)
+        os.mkdir("download")
+
+        total_item = 0
+        html_doc = requests.get(self.base_path).text
+        soup = BeautifulSoup(html_doc, 'html.parser')
+        data = extract_row(soup, self.base_path)
+        for item in data:
+            self.download_list.put(item)
+            total_item += 1
+            if max_item > 0 and total_item >= max_item:
+                break
+        self.download_list.put("DONE")
+
+        workers = [threading.Thread(target=worker, args=[self.download_list], daemon=True) for _ in range(self.number_of_thread)]
+        # Start every worker first, then wait for all of them; starting and
+        # joining inside the same loop would make the downloads sequential.
+        for t in workers:
+            t.start()
+        for t in workers:
+            t.join()
+
+    def get_snapshot(self):
+        html_doc = requests.get(self.base_path).text
+        soup = BeautifulSoup(html_doc, 'html.parser')
+        data = extract_row(soup, self.base_path)
+        snapshots = []
+
+        for item in data:
+            snapshots.append(item)
+            print(f"get info for {item['links']}")
+            l_doc = requests.get(item['links']).text
+            s = BeautifulSoup(l_doc, 'html.parser')
+            d = extract_row(s, item['links'])
+            snapshots += d
+
+        with gzip.open("snapshot.gz", "w") as w:
+            for item in snapshots:
+                w.write(json.dumps(item).encode())
+                w.write("\n".encode())
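The downloader in pdb/pdb_download.py coordinates its threads through a queue.Queue and a single "DONE" sentinel that every worker puts back before exiting, so one sentinel is enough to stop the whole pool. A minimal, self-contained sketch of that pattern (the folder names are placeholders and a print stands in for the real HTTP download) looks like this:

import queue
import threading

def worker(q):
    while True:
        item = q.get()
        if item == "DONE":
            q.put(item)              # re-queue the sentinel so the other workers stop too
            break
        print(f"processing {item}")  # the real worker downloads item['links'] here

q = queue.Queue()
for job in ["folder_a", "folder_b", "folder_c"]:
    q.put(job)
q.put("DONE")

threads = [threading.Thread(target=worker, args=[q], daemon=True) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()

Because the sentinel is re-queued, it does not matter which worker dequeues it first; each one eventually sees it and exits.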
diff --git a/pdb/pdb_metadata_extractor.py b/pdb/pdb_metadata_extractor.py
new file mode 100644
index 0000000..f50f700
--- /dev/null
+++ b/pdb/pdb_metadata_extractor.py
@@ -0,0 +1,68 @@
+import gzip
+import json
+import glob
+import datetime
+import os
+import shutil
+
+class MetadataExctractor:
+
+    def __init__(self) -> None:
+        pass
+
+    def get_metadata(self, path):
+        """Parse the header records of a gzipped PDB file and return them as a JSON string."""
+        p = {}
+        with gzip.open(path) as f:
+            for line in f:
+                l = line.decode("utf-8").strip()
+                if l.startswith("HEADER"):
+                    # HEADER is a fixed-column record: classification, deposition date, PDB id.
+                    p['classification'] = l[10:49].strip().capitalize()
+                    p['pdb'] = l[62:].strip()
+                    p['deposition_date'] = datetime.datetime.strptime(l[50:59], "%d-%b-%y").strftime("%Y-%m-%d")
+                elif l.startswith("TITLE"):
+                    if "title" in p:
+                        p['title'] = f"{p['title']} {l[10:].strip().capitalize()}"
+                    else:
+                        p['title'] = l[10:].strip().capitalize()
+                elif l.startswith("KEYWDS"):
+                    if 'Keywords' in p:
+                        p['Keywords'] = p['Keywords'] + [s.strip().capitalize() for s in l[10:].split(",")]
+                    else:
+                        p['Keywords'] = [s.strip().capitalize() for s in l[10:].split(",")]
+                elif l.startswith("AUTHOR"):
+                    if 'authors' in p:
+                        p['authors'] = p['authors'] + [s.strip().capitalize() for s in l[10:].split(",")]
+                    else:
+                        p['authors'] = [s.strip().capitalize() for s in l[10:].split(",")]
+                elif l.startswith("JRNL"):
+                    if 'PMID' in l:
+                        p['pmid'] = l[l.find("PMID")+5:].strip()
+                    if 'DOI' in l:
+                        p['doi'] = l[l.find("DOI")+5:].strip()
+                elif l.startswith("REMARK"):
+                    # The header section is over once the REMARK records start.
+                    break
+        return json.dumps(p)
+
+    def extract_metadata(self, input_path="download", output_path="metadata"):
+        """Extract the metadata of every downloaded entry into gzipped JSON-lines dumps."""
+        if os.path.exists(output_path) and os.path.isdir(output_path):
+            shutil.rmtree(output_path)
+        os.mkdir(output_path)
+        part = 0
+        i = 0
+        w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
+        for item in glob.glob(f"{input_path}/*.gz"):
+            i += 1
+            if i % 10000 == 0:
+                # Roll over to a new dump file every 10000 entries.
+                part += 1
+                w.flush()
+                w.close()
+                print(f"PARSED {i}")
+                w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
+            w.write(self.get_metadata(item).encode("utf-8"))
+            w.write("\n".encode("utf-8"))
+        w.close()
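get_metadata relies on the fixed column layout of PDB header records: with the slices used above, the classification, the deposition date and the four-character entry id are cut straight out of the HEADER line. A small sketch against a synthetic HEADER record (the entry id 0XXX and the date are made up for illustration) shows what those slices return:

import datetime

# Synthetic HEADER record padded to the fixed PDB columns (entry id and date are invented).
header = "HEADER    " + "OXYGEN TRANSPORT".ljust(40) + "01-JAN-99" + "   " + "0XXX"

classification = header[10:49].strip().capitalize()   # columns 11-49
deposition_date = datetime.datetime.strptime(header[50:59], "%d-%b-%y").strftime("%Y-%m-%d")
pdb_id = header[62:].strip()                           # columns 63 onwards

print(classification, deposition_date, pdb_id)         # Oxygen transport 1999-01-01 0XXX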
gzip.open(f"dump_0{part}.gz", "w") + w.write(self.get_metadata(item).encode("utf-8")) + w.write("\n".encode("utf-8")) diff --git a/pdb/pdb_validator.py b/pdb/pdb_validator.py new file mode 100644 index 0000000..07d2238 --- /dev/null +++ b/pdb/pdb_validator.py @@ -0,0 +1,39 @@ +import json +import gzip +import glob + + + +def validate(input_path="metadata") : + error = {} + valid = 0 + total = 0 + error_record = 0 + for i in glob.glob(f"{input_path}/dump*.gz"): + with gzip.open(i) as f: + for line in f: + data = json.loads(line.decode("utf-8")) + e = False + if "deposition_date" not in data or data['deposition_date'] is None: + error['MissingDate'] = error.get('MissingDate', 0)+1 + e= True + if 'pmid' not in data and 'doi' not in data: + error['MissingLinks'] = error.get('MissingLinks', 0)+1 + e= True + if 'authors' not in data or len(data['authors']) <1: + error['MissingAuthor'] = error.get('MissingAuthor', 0)+1 + e= True + if "title" not in data or data['title'] is None: + error['MissingTitle'] = error.get('MissingTitle', 0)+1 + e= True + if 'Keywords' not in data or data['Keywords'] is None: + error['MissingKeywords'] = error.get('MissingKeywords', 0)+1 + if 'pdb' not in data or len(data['pdb']) != 4: + error['WRONGPDB'] = error.get('WRONGPDB', 0)+1 + e= True + total += 1 + if e: + error_record +=1 + else: + valid +=1 + return (error, valid, total, error_record) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..06e1c60 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +beautifulsoup4==4.12.2 +bs4==0.0.1 +certifi==2023.7.22 +charset-normalizer==3.2.0 +idna==3.4 +requests==2.31.0 +soupsieve==2.5 +urllib3==2.0.4