import gzip
import json
import os
import queue
import re
import shutil
import threading

import requests
from bs4 import BeautifulSoup

# Matches the "YYYY-MM-DD HH:MM" timestamps shown in the directory listing.
regex = r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}"

def extract_row(soup, bp):
    """Parse an index page and return one dict per listed entry.

    Each dict carries 'links' (absolute URL built from the base path bp plus the
    row's href) and 'date' (the listing timestamp). Rows without a matching date
    are skipped.
    """
    rows = []
    tr_tags = soup.find_all('tr')
    for row in tr_tags:
        ci = {}
        for td in row.find_all('td'):
            if td.a:
                ci['links'] = f"{bp}{td.a.get('href')}"
            if re.match(regex, td.text):
                ci['date'] = td.text.strip()
        if 'date' in ci:
            rows.append(ci)
    return rows

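# Example of what extract_row produces, assuming the EBI listing is a plain
# Apache-style index (the href and timestamp below are made-up values):
#
#   <tr><td><a href="a1/">a1/</a></td><td>2024-01-15 10:42</td></tr>
#
# parsed with bp = "https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/"
# becomes
#
#   {'links': '.../divided/pdb/a1/', 'date': '2024-01-15 10:42'}
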
def worker(q):
    """Consume directory entries from the queue and download every file they list."""
    work = True
    while work:
        item = q.get()
        if item == "DONE":
            # Put the sentinel back so the other worker threads also stop.
            work = False
            q.put(item)
        else:
            html_doc = requests.get(item['links']).text
            soup = BeautifulSoup(html_doc, 'html.parser')
            data = extract_row(soup, item['links'])
            for f in data:
                name = f['links'].split("/")[-1]
                r = requests.get(f["links"])
                print(f"DOWNLOADING {f['links']} into download/{name}")
                with open(os.path.join("download", name), 'wb') as fd:
                    fd.write(r.content)

class PDBDownloader:
    """
    PDBDownloader tries to download all the PDB files into a local "download" folder.

    It reads ftp.ebi.ac.uk through its HTTPS web listing rather than FTP, since
    the FTP service does not work.
    """

    def __init__(self, basePath="https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/", number_of_thread=5) -> None:
        self.base_path = basePath
        self.download_list = queue.Queue()
        self.number_of_thread = number_of_thread

    def get_file_to_downloads(self, max_item=-1):
        """Download, in parallel, every folder listed under
        pub/databases/pdb/data/structures/divided/pdb/.

        max_item limits how many listing folders are queued; -1 means no limit.
        """
        # Start from an empty download folder.
        shutil.rmtree("download", ignore_errors=True)
        os.mkdir("download")

        total_item = 0
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        for item in data:
            self.download_list.put(item)
            total_item += 1
            if max_item > 0 and total_item >= max_item:
                break
        self.download_list.put("DONE")

        workers = [threading.Thread(target=worker, args=[self.download_list], daemon=True)
                   for _ in range(self.number_of_thread)]
        # Start every worker first, then wait for all of them; joining inside the
        # start loop would run the threads one at a time.
        for w in workers:
            w.start()
        for w in workers:
            w.join()

    def get_snapshot(self):
        """Walk the listing one level deep and write every entry, as one JSON
        object per line, into snapshot.gz."""
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        snapshots = []

        for item in data:
            snapshots.append(item)
            print(f"get info for {item['links']}")
            l_doc = requests.get(item['links']).text
            s = BeautifulSoup(l_doc, 'html.parser')
            d = extract_row(s, item['links'])
            snapshots += d

        with gzip.open("snapshot.gz", "w") as w:
            for item in snapshots:
                w.write(json.dumps(item).encode())
                w.write("\n".encode())
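

# Minimal usage sketch: max_item=2 is an arbitrary small value so only a couple
# of listing folders are fetched while trying the class out, and the read-back
# loop simply shows the snapshot.gz format (one JSON object per line).
if __name__ == "__main__":
    downloader = PDBDownloader(number_of_thread=5)
    downloader.get_file_to_downloads(max_item=2)   # fills download/ in parallel
    downloader.get_snapshot()                      # walks the whole listing, can take a while

    with gzip.open("snapshot.gz", "rt") as fh:
        for line in fh:
            print(json.loads(line))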