Compare commits
2 Commits
1cb03ea7a0
...
7decdade98
Author | SHA1 | Date |
---|---|---|
Sandro La Bruzzo | 7decdade98 | |
Sandro La Bruzzo | a9712c63d1 |
|
@ -10,6 +10,11 @@ __pycache__/
|
|||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
download
|
||||
metadata
|
||||
uniprot_metadata
|
||||
uniprot_sprot.dat.gz
|
||||
dump*.gz
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
from pdb.pdb_download import PDBDownloader
|
||||
from pdb.pdb_metadata_extractor import MetadataExctractor
|
||||
from pdb.pdb_validator import validate
|
||||
from uniprot.download import UniprotSwissDownloader
|
||||
from uniprot.metadata import MetadataExctractor as ME
|
||||
|
||||
if __name__ == '__main__':
    # Uniprot pipeline: the (slow) download step is intentionally skipped;
    # metadata is re-extracted from an already present uniprot_sprot.dat.gz.
    uniprot_downloader = UniprotSwissDownloader()
    # uniprot_downloader.download()
    extractor = ME()
    extractor.extract_metadata()

    # PDB pipeline (currently disabled):
    # p = PDBDownloader()
    # p.get_file_to_downloads(max_item=4)
    # m = MetadataExctractor()
    # m.extract_metadata()
    # error, valid, total, error_record = validate()
    # print(error)
    # print(f"Valid {valid}/{total}")
    # print(f"Error {error_record}/{total}")
|
||||
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
import gzip
|
||||
import json
|
||||
import os
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import requests
|
||||
import queue
|
||||
import threading
|
||||
import os
|
||||
import shutil
|
||||
|
||||
regex = r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}"
|
||||
|
||||
|
||||
def extract_row(soup, bp):
    """Collect file rows from an Apache-style directory-listing page.

    Scans every ``<tr>`` of *soup*; for each row it records the absolute link
    (base path *bp* joined with the anchor's ``href``) and the modification
    date. Rows without a date cell (e.g. the "Parent Directory" header row)
    are skipped.

    Args:
        soup: BeautifulSoup document of the listing page.
        bp: base URL prefix prepended to each row's relative ``href``.

    Returns:
        list[dict]: one ``{'links': ..., 'date': ...}`` dict per file row.
    """
    rows = []
    for row in soup.find_all('tr'):
        item = {}
        for td in row.find_all('td'):
            # A cell with an anchor carries the file/folder link.
            if td.a:
                item['links'] = f"{bp}{td.a.get('href')}"
            # Date cells look like "2023-08-01 10:15" (see module `regex`).
            if re.match(regex, td.text):
                item['date'] = td.text.strip()
        # Keep only real entries; header/parent rows have no date cell.
        if 'date' in item:
            rows.append(item)
    return rows
|
||||
|
||||
|
||||
def worker(q):
    """Queue consumer: download every file listed under one directory entry.

    Pulls directory items from *q* until the "DONE" sentinel is seen; the
    sentinel is pushed back so the sibling worker threads terminate too.
    Each downloaded file lands in the local ``download`` folder.
    """
    while True:
        item = q.get()
        if item == "DONE":
            # Re-enqueue the sentinel for the other workers, then stop.
            q.put(item)
            break
        html_doc = requests.get(item['links']).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        for entry in extract_row(soup, item['links']):
            name = entry['links'].split("/")[-1]
            response = requests.get(entry["links"])
            print(f"DOWNLOADING {entry['links']} into download/{name}")
            with open(os.path.join("download", name), 'wb') as fd:
                fd.write(response.content)
|
||||
|
||||
|
||||
class PDBDownloader:
    """
    PDB Downloader is a class that tries to download all the pdbs into a local download folder.
    It uses ftp.ebi.ac.uk but the https website since the ftp not works.
    """

    def __init__(self, basePath="https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/", number_of_thread=5) -> None:
        self.base_path = basePath
        # Work queue shared between this producer and the worker threads.
        self.download_list = queue.Queue()
        self.number_of_thread = number_of_thread

    def get_file_to_downloads(self, max_item=-1):
        """This method runs in parallel the download of all the folders under
        pub/databases/pdb/data/structures/divided/pdb/

        Args:
            max_item: stop after enqueueing this many folders (-1 = no limit).
        """
        # Start from a clean download folder; tolerate it not existing yet
        # (the original rmtree crashed on a fresh checkout).
        if os.path.isdir("download"):
            shutil.rmtree("download")
        os.mkdir("download")

        total_item = 0
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        for item in data:
            # BUGFIX: enforce the limit BEFORE enqueueing; the original put
            # the item first, so max_item=4 actually enqueued 5 items.
            if max_item > 0 and total_item >= max_item:
                break
            self.download_list.put(item)
            total_item += 1
        self.download_list.put("DONE")

        workers = [threading.Thread(target=worker, args=[self.download_list], daemon=True)
                   for _ in range(self.number_of_thread)]
        # BUGFIX: start ALL threads first, then join them all. The original
        # joined each thread right after starting it, which serialized the
        # downloads and defeated the thread pool entirely.
        for t in workers:
            t.start()
        for t in workers:
            t.join()

    def get_snapshot(self):
        """Crawl two directory levels starting at base_path and persist every
        discovered row to ``snapshot.gz`` as newline-delimited JSON."""
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        snapshots = []

        for item in data:
            snapshots.append(item)
            print(f"get info for {item['links']}")
            l_doc = requests.get(item['links']).text
            s = BeautifulSoup(l_doc, 'html.parser')
            snapshots += extract_row(s, item['links'])

        with gzip.open("snapshot.gz", "w") as w:
            for item in snapshots:
                w.write(json.dumps(item).encode())
                w.write("\n".encode())
|
|
@ -0,0 +1,68 @@
|
|||
import gzip
|
||||
import json
|
||||
import glob
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
|
||||
class MetadataExctractor:
    """Extract bibliographic metadata from gzipped PDB entry files."""

    def __init__(self) -> None:
        pass

    def get_metadata(self, path):
        """Parse one gzipped PDB file and return its metadata as a JSON string.

        Reads the fixed-column header section (HEADER, TITLE, KEYWDS, AUTHOR,
        JRNL records) and stops at the first REMARK record.

        Args:
            path: path to a ``*.gz`` PDB entry file.

        Returns:
            str: JSON object with classification, pdb id, deposition_date,
            title, Keywords, authors and (when present) pmid/doi.
        """
        with gzip.open(path) as f:
            p = {}
            for line in f:
                l = line.decode("utf-8").strip()
                if l.startswith("HEADER"):
                    # Fixed PDB columns: classification, deposition date, id.
                    p['classification'] = l[10:49].strip().capitalize()
                    p['pdb'] = l[62:].strip()
                    p['deposition_date'] = datetime.datetime.strptime(l[50:59], "%d-%b-%y").strftime("%Y-%m-%d")
                elif l.startswith("REMARK"):
                    # Header section is over: nothing else to harvest.
                    # (The original had a second, unreachable REMARK check
                    # further down; it has been removed.)
                    break
                elif l.startswith("TITLE"):
                    if "title" in p:
                        # Continuation line: append to the running title.
                        p['title'] = f"{p['title']} {l[10:].strip().capitalize()}"
                    else:
                        p['title'] = l[10:].strip().capitalize()
                elif l.startswith("KEYWDS"):
                    # BUGFIX: strip each keyword on the first line too — the
                    # original only stripped on continuation lines, leaving
                    # leading spaces (and mis-capitalized words) in Keywords.
                    kws = [s.strip().capitalize() for s in l[11:].split(",")]
                    p['Keywords'] = p.get('Keywords', []) + kws
                elif l.startswith("AUTHOR"):
                    authors = [s.strip().capitalize() for s in l[10:].split(",")]
                    p['authors'] = p.get('authors', []) + authors
                elif l.startswith("JRNL"):
                    if 'PMID' in l:
                        p['pmid'] = l[l.find("PMID") + 5:].strip()
                    if 'DOI' in l:
                        p['doi'] = l[l.find("DOI") + 5:].strip()
            return json.dumps(p)

    def extract_metadata(self, input_path="download", output_path="metadata"):
        """Parse every ``*.gz`` PDB file in *input_path* and write the records
        as newline-delimited JSON into rotating gzip dumps under
        *output_path* (a new part every 10000 files)."""
        if (os.path.exists(output_path) and os.path.isdir(output_path)):
            shutil.rmtree(output_path)
        os.mkdir(output_path)
        part = 0
        i = 0
        w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        try:
            for item in glob.glob(f"{input_path}/*.gz"):
                i += 1
                if i % 10000 == 0:
                    part += 1
                    w.flush()
                    w.close()
                    print(f"PARSED {i}")
                    # BUGFIX: rotated parts were opened as "dump_0{part}.gz"
                    # in the CWD — the output_path prefix was missing.
                    w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
                w.write(self.get_metadata(item).encode("utf-8"))
                w.write("\n".encode("utf-8"))
        finally:
            # BUGFIX: the last (possibly partial) dump was never closed.
            w.close()
|
|
@ -0,0 +1,39 @@
|
|||
import json
|
||||
import gzip
|
||||
import glob
|
||||
|
||||
|
||||
|
||||
def validate(input_path="metadata"):
    """Validate the JSON records stored in every ``dump*.gz`` under *input_path*.

    Each record is checked for a deposition date, at least one external link
    (pmid or doi), a non-empty author list, a title, keywords, and a
    4-character pdb id.

    Returns:
        tuple: (error counters dict, valid count, total count, error count)
    """
    # (label, predicate, fatal) — MissingKeywords is tallied but deliberately
    # does NOT mark the record as invalid, matching the original behaviour.
    checks = (
        ('MissingDate', lambda r: "deposition_date" not in r or r['deposition_date'] is None, True),
        ('MissingLinks', lambda r: 'pmid' not in r and 'doi' not in r, True),
        ('MissingAuthor', lambda r: 'authors' not in r or len(r['authors']) < 1, True),
        ('MissingTitle', lambda r: "title" not in r or r['title'] is None, True),
        ('MissingKeywords', lambda r: 'Keywords' not in r or r['Keywords'] is None, False),
        ('WRONGPDB', lambda r: 'pdb' not in r or len(r['pdb']) != 4, True),
    )

    error = {}
    valid = 0
    total = 0
    error_record = 0
    for dump in glob.glob(f"{input_path}/dump*.gz"):
        with gzip.open(dump) as fh:
            for raw in fh:
                record = json.loads(raw.decode("utf-8"))
                bad = False
                for label, failed, fatal in checks:
                    if failed(record):
                        error[label] = error.get(label, 0) + 1
                        bad = bad or fatal
                total += 1
                if bad:
                    error_record += 1
                else:
                    valid += 1
    return (error, valid, total, error_record)
|
|
@ -0,0 +1,8 @@
|
|||
beautifulsoup4==4.12.2
|
||||
bs4==0.0.1
|
||||
certifi==2023.7.22
|
||||
charset-normalizer==3.2.0
|
||||
idna==3.4
|
||||
requests==2.31.0
|
||||
soupsieve==2.5
|
||||
urllib3==2.0.4
|
|
@ -0,0 +1,24 @@
|
|||
import requests
|
||||
import sys
|
||||
class UniprotSwissDownloader():
    """Download the Swiss-Prot flat file (uniprot_sprot.dat.gz) from EBI,
    streaming it to disk with a textual progress bar."""

    def __init__(self, url="https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz") -> None:
        # Source URL; overridable for mirrors or tests.
        self.url = url

    def download(self):
        """Stream the archive into ./uniprot_sprot.dat.gz.

        When the server reports a Content-Length, a 50-character progress bar
        is rendered on stdout; otherwise the whole body is written in one shot.
        """
        # BUGFIX: close the streamed response when done — the original leaked
        # the connection. requests.Response is a context manager.
        with requests.get(self.url, stream=True) as r:
            total_length = r.headers.get('content-length')

            with open("uniprot_sprot.dat.gz", 'wb') as f:
                if total_length is None:  # no content length header
                    f.write(r.content)
                else:
                    dl = 0
                    total_length = int(total_length)
                    for data in r.iter_content(chunk_size=4096):
                        dl += len(data)
                        f.write(data)
                        done = int(50 * dl / total_length)
                        sys.stdout.write("\rDownloading[%s%s]" % ('=' * done, ' ' * (50-done)) )
                        sys.stdout.flush()
|
|
@ -0,0 +1,75 @@
|
|||
import gzip
|
||||
import json
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
import re
|
||||
|
||||
regex = r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
|
||||
|
||||
class MetadataExctractor:
    """Stream metadata records out of a gzipped Swiss-Prot flat file."""

    def __init__(self, path="uniprot_sprot.dat.gz") -> None:
        print(f"Open Path {path}")
        # Kept open for the lifetime of the extractor; get_metadata()
        # consumes it lazily.
        self.current_file = gzip.open(path)

    def get_metadata(self):
        """Yield one JSON string per Swiss-Prot entry (entries end with //).

        Parses AC, DT, DE (RecName), OS, OC and RX line types into a dict.
        """
        p = {}
        for line in self.current_file:
            l = line.decode("utf-8").strip()
            if l.startswith("AC"):
                p['pid'] = l[4:-1]
            elif l.startswith("DT "):
                cd = {}
                k = l[4:-1].split(',')
                cd['date'] = datetime.datetime.strptime(k[0].strip(), "%d-%b-%Y").strftime("%Y-%m-%d")
                cd['date_info'] = k[1].strip()
                dates = p.get("dates", [])
                dates.append(cd)
                p['dates'] = dates
            elif l.startswith("DE "):
                k = l[4:-1]
                if 'RecName: Full=' in k:
                    p['title'] = k[k.find('=') + 1:]
            elif l.startswith("OS "):
                p['organism_species'] = l[4:-1].strip()
            elif l.startswith("OC "):
                s = l[4:-1].strip().split(';')
                subjects = p.get('subjects', [])
                for sub in s:
                    subjects.append(sub.strip())
                p['subjects'] = subjects
            elif l.startswith("RX "):
                references = [c.strip() for c in l[4:-1].strip().split(';')]
                relations = []
                for r in references:
                    dd = r.split("=")
                    if len(dd) == 2 and dd[0].lower().strip() in ['pubmed', 'doi']:
                        relations.append({dd[0]: dd[1]})
                if len(relations):
                    p["references"] = relations
            elif l.startswith("//"):
                yield json.dumps(p)
                # BUGFIX: reset the accumulator — the original never cleared
                # it, so every record inherited stale fields from all the
                # entries parsed before it.
                p = {}

    def extract_metadata(self, output_path="uniprot_metadata"):
        """Write all records as newline-delimited JSON into rotating gzip
        dumps under *output_path* (a new part every 10000 records)."""
        if (os.path.exists(output_path) and os.path.isdir(output_path)):
            shutil.rmtree(output_path)
        os.mkdir(output_path)
        part = 0
        i = 0
        w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        try:
            for item in self.get_metadata():
                i += 1
                w.write(item.encode())
                w.write("\n".encode())
                if i % 10000 == 0:
                    part += 1
                    w.flush()
                    w.close()
                    print(f"PARSED {i}")
                    # BUGFIX: rotated parts were opened in the CWD — the
                    # output_path prefix was missing.
                    w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        finally:
            # BUGFIX: close the final (possibly partial) dump file.
            w.close()
|
||||
|
Loading…
Reference in New Issue