bioentities-preprocess/pdb/pdb_download.py

import gzip
import json
import os
import queue
import re
import shutil
import threading

import requests
from bs4 import BeautifulSoup

# Matches the "YYYY-MM-DD HH:MM" timestamps shown in the EBI directory-listing pages.
regex = r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}"

def extract_row(soup, bp):
    """Extract one {'links': ..., 'date': ...} entry per row of a directory-listing table."""
    l = []
    for row in soup.find_all('tr'):
        ci = {}
        for td in row.find_all('td'):
            if td.a:
                ci['links'] = f"{bp}{td.a.get('href')}"
            if re.match(regex, td.text):
                ci['date'] = td.text.strip()
        if 'date' in ci:
            l.append(ci)
    return l
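

# Example of the listing markup extract_row() expects (an assumption inferred from
# the parsing logic above; the real EBI page may differ in detail):
#
#   <tr>
#     <td><a href="ab/">ab/</a></td>
#     <td>2023-01-15 10:42</td>
#   </tr>
#
# which would yield {'links': f"{bp}ab/", 'date': '2023-01-15 10:42'}.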

def worker(q):
    """Consume directory entries from the queue and download every file they list."""
    while True:
        item = q.get()
        if item == "DONE":
            # Re-queue the sentinel so the other workers can shut down too.
            q.put(item)
            break
        html_doc = requests.get(item['links']).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, item['links'])
        for f in data:
            name = f['links'].split("/")[-1]
            r = requests.get(f["links"])
            print(f"DOWNLOADING {f['links']} into download/{name}")
            with open(os.path.join("download", name), 'wb') as fd:
                fd.write(r.content)

class PDBDownloader:
    """
    Downloads all the PDB entries into a local "download" folder.
    It uses ftp.ebi.ac.uk, but through the HTTPS mirror since plain FTP does not work.
    """

    def __init__(self, basePath="https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/", number_of_thread=5) -> None:
        self.base_path = basePath
        self.download_list = queue.Queue()
        self.number_of_thread = number_of_thread

    def get_file_to_downloads(self, max_item=-1):
        """Download, in parallel, the contents of every folder under
        pub/databases/pdb/data/structures/divided/pdb/.
        """
        # Start from a clean download folder.
        shutil.rmtree("download", ignore_errors=True)
        os.mkdir("download")
        total_item = 0
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        for item in data:
            self.download_list.put(item)
            total_item += 1
            if 0 < max_item <= total_item:
                break
        self.download_list.put("DONE")
        workers = [threading.Thread(target=worker, args=[self.download_list], daemon=True)
                   for _ in range(self.number_of_thread)]
        # Start all workers first, then wait for them; starting and joining one at a
        # time would run the downloads sequentially.
        for w in workers:
            w.start()
        for w in workers:
            w.join()

    def get_snapshot(self):
        """Crawl the listing (two levels deep) and write every entry to snapshot.gz as JSON lines."""
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        snapshots = []
        for item in data:
            snapshots.append(item)
            print(f"get info for {item['links']}")
            l_doc = requests.get(item['links']).text
            s = BeautifulSoup(l_doc, 'html.parser')
            d = extract_row(s, item['links'])
            snapshots += d
        with gzip.open("snapshot.gz", "wb") as w:
            for item in snapshots:
                w.write(json.dumps(item).encode())
                w.write("\n".encode())