import gzip
import json
import os
import queue
import re
import shutil
import threading

import requests
from bs4 import BeautifulSoup

# Matches the "YYYY-MM-DD HH:MM" timestamps shown in the directory listing.
regex = r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}"

def extract_row(soup, bp):
    """Parse an index page and return one dict per listed entry.

    Each dict carries 'links' (absolute URL built from the base path bp plus the
    row's href) and 'date' (the listing timestamp). Rows without a matching date
    are skipped.
    """
    rows = []
    tr_tags = soup.find_all('tr')
    for row in tr_tags:
        ci = {}
        for td in row.find_all('td'):
            if td.a:
                ci['links'] = f"{bp}{td.a.get('href')}"
            if re.match(regex, td.text):
                ci['date'] = td.text.strip()
        if 'date' in ci:
            rows.append(ci)
    return rows

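# Example of what extract_row produces, assuming the EBI listing is a plain
# Apache-style index (the href and timestamp below are made-up values):
#
#   <tr><td><a href="a1/">a1/</a></td><td>2024-01-15 10:42</td></tr>
#
# parsed with bp = "https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/"
# becomes
#
#   {'links': '.../divided/pdb/a1/', 'date': '2024-01-15 10:42'}
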
def worker(q):
    """Consume directory entries from the queue and download every file they list."""
    work = True
    while work:
        item = q.get()
        if item == "DONE":
            # Put the sentinel back so the other worker threads also stop.
            work = False
            q.put(item)
        else:
            html_doc = requests.get(item['links']).text
            soup = BeautifulSoup(html_doc, 'html.parser')
            data = extract_row(soup, item['links'])
            for f in data:
                name = f['links'].split("/")[-1]
                r = requests.get(f["links"])
                print(f"DOWNLOADING {f['links']} into download/{name}")
                with open(os.path.join("download", name), 'wb') as fd:
                    fd.write(r.content)

class PDBDownloader:
    """
    PDBDownloader tries to download all the PDB files into a local "download" folder.

    It reads ftp.ebi.ac.uk through its HTTPS web listing rather than FTP, since
    the FTP service does not work.
    """

    def __init__(self, basePath="https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/", number_of_thread=5) -> None:
        self.base_path = basePath
        self.download_list = queue.Queue()
        self.number_of_thread = number_of_thread

    def get_file_to_downloads(self, max_item=-1):
        """Download, in parallel, every folder listed under
        pub/databases/pdb/data/structures/divided/pdb/.

        max_item limits how many listing folders are queued; -1 means no limit.
        """
        # Start from an empty download folder.
        shutil.rmtree("download", ignore_errors=True)
        os.mkdir("download")

        total_item = 0
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        for item in data:
            self.download_list.put(item)
            total_item += 1
            if max_item > 0 and total_item >= max_item:
                break
        self.download_list.put("DONE")

        workers = [threading.Thread(target=worker, args=[self.download_list], daemon=True)
                   for _ in range(self.number_of_thread)]
        # Start every worker first, then wait for all of them; joining inside the
        # start loop would run the threads one at a time.
        for w in workers:
            w.start()
        for w in workers:
            w.join()

    def get_snapshot(self):
        """Walk the listing one level deep and write every entry, as one JSON
        object per line, into snapshot.gz."""
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        snapshots = []

        for item in data:
            snapshots.append(item)
            print(f"get info for {item['links']}")
            l_doc = requests.get(item['links']).text
            s = BeautifulSoup(l_doc, 'html.parser')
            d = extract_row(s, item['links'])
            snapshots += d

        with gzip.open("snapshot.gz", "w") as w:
            for item in snapshots:
                w.write(json.dumps(item).encode())
                w.write("\n".encode())
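

# Minimal usage sketch: max_item=2 is an arbitrary small value so only a couple
# of listing folders are fetched while trying the class out, and the read-back
# loop simply shows the snapshot.gz format (one JSON object per line).
if __name__ == "__main__":
    downloader = PDBDownloader(number_of_thread=5)
    downloader.get_file_to_downloads(max_item=2)   # fills download/ in parallel
    downloader.get_snapshot()                      # walks the whole listing, can take a while

    with gzip.open("snapshot.gz", "rt") as fh:
        for line in fh:
            print(json.loads(line))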