imported pdb preprocessing scripts

Sandro La Bruzzo 2023-09-20 15:16:12 +02:00
parent 1cb03ea7a0
commit a9712c63d1
7 changed files with 234 additions and 0 deletions

.gitignore (vendored, +2 lines)

@@ -10,6 +10,8 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
+download
+metadata
 develop-eggs/
 dist/
 downloads/

main.py (Normal file, +14 lines)

@@ -0,0 +1,14 @@
from pdb.pdb_download import PDBDownloader
from pdb.pdb_metadata_extractor import MetadataExctractor
from pdb.pdb_validator import validate


if __name__ == '__main__':
    # Download a small sample of folders, extract the metadata, then validate it.
    p = PDBDownloader()
    p.get_file_to_downloads(max_item=4)
    m = MetadataExctractor()
    m.extract_metadata()
    error, valid, total, error_record = validate()
    print(error)
    print(f"Valid {valid}/{total}")
    print(f"Error {error_record}/{total}")

pdb/__init__.py (Normal file, empty)

pdb/pdb_download.py (Normal file, +103 lines)

@@ -0,0 +1,103 @@
import gzip
import json
import os
import queue
import re
import shutil
import threading

import requests
from bs4 import BeautifulSoup

# Matches the "YYYY-MM-DD HH:MM" timestamps shown in the EBI directory listing.
regex = r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}"


def extract_row(soup, bp):
    """Extract the (link, date) entries from one directory-listing page."""
    l = []
    tr_tags = soup.find_all('tr')
    for row in tr_tags:
        ci = {}
        for td in row.find_all('td'):
            if td.a:
                ci['links'] = f"{bp}{td.a.get('href')}"
            if re.match(regex, td.text):
                ci['date'] = td.text.strip()
        if 'date' in ci:
            l.append(ci)
    return l


def worker(q):
    """Consume folder entries from the queue and download every file they list."""
    work = True
    while work:
        item = q.get()
        if item == "DONE":
            work = False
            q.put(item)  # put the sentinel back so the other workers stop too
        else:
            html_doc = requests.get(item['links']).text
            soup = BeautifulSoup(html_doc, 'html.parser')
            data = extract_row(soup, item['links'])
            for f in data:
                name = f['links'].split("/")[-1]
                r = requests.get(f["links"])
                print(f"DOWNLOADING {f['links']} into download/{name}")
                with open(os.path.join("download", name), 'wb') as fd:
                    fd.write(r.content)


class PDBDownloader:
    """
    PDBDownloader tries to download all the PDB entries into a local download folder.
    It reads ftp.ebi.ac.uk through its HTTPS website, since plain FTP does not work.
    """

    def __init__(self, basePath="https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/", number_of_thread=5) -> None:
        self.base_path = basePath
        self.download_list = queue.Queue()
        self.number_of_thread = number_of_thread

    def get_file_to_downloads(self, max_item=-1):
        """Run parallel downloads of all the folders under
        pub/databases/pdb/data/structures/divided/pdb/
        """
        shutil.rmtree("download", ignore_errors=True)
        os.mkdir("download")
        total_item = 0
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        for item in data:
            self.download_list.put(item)
            total_item += 1
            if max_item > 0 and total_item > max_item:
                break
        self.download_list.put("DONE")
        workers = [threading.Thread(target=worker, args=[self.download_list], daemon=True)
                   for i in range(self.number_of_thread)]
        # Start every worker first, then wait for all of them; joining inside the
        # start loop would make the downloads sequential.
        for t in workers:
            t.start()
        for t in workers:
            t.join()

    def get_snapshot(self):
        """Crawl the listing and write every (link, date) entry to snapshot.gz."""
        html_doc = requests.get(self.base_path).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        data = extract_row(soup, self.base_path)
        snapshots = []
        for item in data:
            snapshots.append(item)
            print(f"get info for {item['links']}")
            l_doc = requests.get(item['links']).text
            s = BeautifulSoup(l_doc, 'html.parser')
            d = extract_row(s, item['links'])
            snapshots += d
        with gzip.open("snapshot.gz", "w") as w:
            for item in snapshots:
                w.write(json.dumps(item).encode())
                w.write("\n".encode())
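
The downloader can also be exercised on its own; a minimal sketch, mirroring the call in main.py and run from the repository root, assuming the package layout above:

from pdb.pdb_download import PDBDownloader

# Limit the crawl to a couple of index folders so the run stays small.
p = PDBDownloader(number_of_thread=2)
p.get_file_to_downloads(max_item=2)
# Optionally record the full listing (links and dates) into snapshot.gz.
p.get_snapshot()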

pdb/pdb_metadata_extractor.py (Normal file, +68 lines)

@@ -0,0 +1,68 @@
import gzip
import json
import glob
import datetime
import os
import shutil


class MetadataExctractor:

    def __init__(self) -> None:
        pass

    def get_metadata(self, path):
        """Parse the fixed-column header records of one gzipped PDB file into a JSON string."""
        with gzip.open(path) as f:
            p = {}
            for line in f:
                l = line.decode("utf-8").strip()
                if l.startswith("HEADER"):
                    p['classification'] = l[10:49].strip().capitalize()
                    p['pdb'] = l[62:].strip()
                    p['deposition_date'] = datetime.datetime.strptime(l[50:59], "%d-%b-%y").strftime("%Y-%m-%d")
                elif l.startswith("REMARK"):
                    # The metadata of interest ends where the REMARK section starts.
                    break
                elif l.startswith("TITLE"):
                    if "title" in p:
                        p['title'] = f"{p['title']} {l[10:].strip().capitalize()}"
                    else:
                        p['title'] = l[10:].strip().capitalize()
                elif l.startswith("KEYWDS"):
                    if 'Keywords' in p:
                        p['Keywords'] = p['Keywords'] + [s.strip().capitalize() for s in l[11:].split(",")]
                    else:
                        p['Keywords'] = [s.strip().capitalize() for s in l[11:].split(",")]
                elif l.startswith("AUTHOR"):
                    if 'authors' in p:
                        p['authors'] = p['authors'] + [s.strip().capitalize() for s in l[10:].split(",")]
                    else:
                        p['authors'] = [s.strip().capitalize() for s in l[10:].split(",")]
                elif l.startswith("JRNL"):
                    if 'PMID' in l:
                        p['pmid'] = l[l.find("PMID") + 5:].strip()
                    if 'DOI' in l:
                        p['doi'] = l[l.find("DOI") + 5:].strip()
        return json.dumps(p)

    def extract_metadata(self, input_path="download", output_path="metadata"):
        """Convert every downloaded *.gz file into JSON lines, split into parts of 10000 records."""
        if os.path.exists(output_path) and os.path.isdir(output_path):
            shutil.rmtree(output_path)
        os.mkdir(output_path)
        part = 0
        i = 0
        w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        for item in glob.glob(f"{input_path}/*.gz"):
            i += 1
            if i % 10000 == 0:
                # Roll over to a new part file every 10000 parsed entries.
                part += 1
                w.flush()
                w.close()
                print(f"PARSED {i}")
                w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
            w.write(self.get_metadata(item).encode("utf-8"))
            w.write("\n".encode("utf-8"))
        w.close()

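The slices above follow the fixed-width PDB header layout (classification, deposition date, and the four-character id at fixed offsets). A hypothetical check of get_metadata on a fabricated two-record header, run from the repository root like main.py; the file name sample.gz and its content are illustrative only:

import gzip
from pdb.pdb_metadata_extractor import MetadataExctractor

# Fabricated PDB header: classification, deposition date and id in their fixed columns.
sample = ("HEADER    HYDROLASE" + " " * 31 + "20-SEP-23   1ABC\n"
          "TITLE     CRYSTAL STRUCTURE OF A SAMPLE PROTEIN\n")
with gzip.open("sample.gz", "wb") as f:
    f.write(sample.encode("utf-8"))

m = MetadataExctractor()
print(m.get_metadata("sample.gz"))
# {"classification": "Hydrolase", "pdb": "1ABC", "deposition_date": "2023-09-20",
#  "title": "Crystal structure of a sample protein"}
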
pdb/pdb_validator.py (Normal file, +39 lines)

@@ -0,0 +1,39 @@
import json
import gzip
import glob


def validate(input_path="metadata"):
    """Check every metadata record and count the missing-field errors."""
    error = {}
    valid = 0
    total = 0
    error_record = 0
    for i in glob.glob(f"{input_path}/dump*.gz"):
        with gzip.open(i) as f:
            for line in f:
                data = json.loads(line.decode("utf-8"))
                e = False
                if "deposition_date" not in data or data['deposition_date'] is None:
                    error['MissingDate'] = error.get('MissingDate', 0) + 1
                    e = True
                if 'pmid' not in data and 'doi' not in data:
                    error['MissingLinks'] = error.get('MissingLinks', 0) + 1
                    e = True
                if 'authors' not in data or len(data['authors']) < 1:
                    error['MissingAuthor'] = error.get('MissingAuthor', 0) + 1
                    e = True
                if "title" not in data or data['title'] is None:
                    error['MissingTitle'] = error.get('MissingTitle', 0) + 1
                    e = True
                if 'Keywords' not in data or data['Keywords'] is None:
                    # Missing keywords are counted but do not invalidate the record.
                    error['MissingKeywords'] = error.get('MissingKeywords', 0) + 1
                if 'pdb' not in data or len(data['pdb']) != 4:
                    error['WRONGPDB'] = error.get('WRONGPDB', 0) + 1
                    e = True
                total += 1
                if e:
                    error_record += 1
                else:
                    valid += 1
    return (error, valid, total, error_record)

requirements.txt (Normal file, +8 lines)

@@ -0,0 +1,8 @@
beautifulsoup4==4.12.2
bs4==0.0.1
certifi==2023.7.22
charset-normalizer==3.2.0
idna==3.4
requests==2.31.0
soupsieve==2.5
urllib3==2.0.4
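
With the dependencies above installed, the whole pipeline (download a sample, extract the metadata, validate it) runs as a single script. A minimal session, assuming Python 3 and with the virtual-environment step optional:

python -m venv venv && source venv/bin/activate
pip install -r requirements.txt
python main.py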