Imported UniProt validation and parsing scripts

This commit is contained in:
Sandro La Bruzzo 2023-09-22 14:22:20 +02:00
parent 7decdade98
commit 6d119457fc
3 changed files with 50 additions and 7 deletions

View File

@ -3,12 +3,18 @@ from pdb.pdb_metadata_extractor import MetadataExctractor
from pdb.pdb_validator import validate
from uniprot.download import UniprotSwissDownloader
from uniprot.metadata import MetadataExctractor as ME
from uniprot.uniprot_validator import validate as validate_uniprot
# Entry point: extract UniProt Swiss-Prot metadata and report validation stats.
# NOTE(review): indentation shown here was flattened by the diff renderer.
if __name__ == '__main__':
u = UniprotSwissDownloader()
# Download step disabled — presumably the Swiss-Prot dump is already on disk; confirm.
#u.download()
k = ME()
# Parse the dump and write gzipped JSON-lines metadata files.
k.extract_metadata()
# Validate the generated dumps and print per-error counts plus valid/error totals.
error, valid, total, error_record = validate_uniprot()
print(error)
print(f"Valid {valid}/{total}")
print(f"Error {error_record}/{total}")
# k.extract_metadata()
# p = PDBDownloader()
# p.get_file_to_downloads(max_item=4)

View File

@ -15,6 +15,7 @@ class MetadataExctractor:
def get_metadata(self) :
p = {}
subjCount =0
for line in self.current_file:
l = line.decode("utf-8").strip()
if l.startswith("AC"):
@ -34,11 +35,12 @@ class MetadataExctractor:
elif l.startswith("OS "):
p['organism_species'] =l[4:-1].strip()
elif l.startswith("OC "):
elif l.startswith("OC ") and subjCount < 20:
s = l[4:-1].strip().split(';')
subjects = p.get('subjects', [])
for sub in s:
subjects.append(sub.strip())
subjCount += 1
p['subjects']= subjects
elif l.startswith("RX "):
references = [c.strip() for c in l[4:-1].strip().split(';')]
@ -51,7 +53,11 @@ class MetadataExctractor:
if len(relations):
p["references"] = relations
elif l.startswith("//"):
yield json.dumps(p)
s =json.dumps(p)
subjCount = 0
del p
p = {}
yield s
def extract_metadata(self, output_path="uniprot_metadata"):
@ -65,11 +71,9 @@ class MetadataExctractor:
i +=1
w.write(item.encode())
w.write("\n".encode())
if i % 10000==0:
if i % 100000==0:
part +=1
w.flush()
w.close()
print(f"PARSED {i}")
w = gzip.open(f"dump_0{part}.gz", "w")
w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")

View File

@ -0,0 +1,33 @@
import json
import gzip
import glob
def validate(input_path="uniprot_metadata"):
    """Validate gzipped JSON-lines metadata dumps produced by the extractor.

    Scans every ``dump*.gz`` file under *input_path*; each line must be one
    JSON object. Each record is checked for ``dates``, ``pid``, ``title``
    and ``subjects``.

    Args:
        input_path: directory containing the ``dump*.gz`` files.

    Returns:
        tuple: ``(error, valid, total, error_record)`` where ``error`` maps
        an error label to its occurrence count, ``valid`` / ``error_record``
        count records without / with at least one blocking error, and
        ``total`` is the number of records examined.
    """
    error = {}
    valid = 0
    total = 0
    error_record = 0
    for dump_file in glob.glob(f"{input_path}/dump*.gz"):
        with gzip.open(dump_file) as f:
            for line in f:
                data = json.loads(line.decode("utf-8"))
                e = False
                if "dates" not in data or data['dates'] is None or len(data['dates']) == 0:
                    error['MissingDate'] = error.get('MissingDate', 0) + 1
                    e = True
                # BUG FIX: original used `and`, which raised KeyError on
                # records lacking 'pid' (the subscript was evaluated exactly
                # when the key was absent) and never flagged empty pid lists.
                if 'pid' not in data or len(data['pid']) == 0:
                    error['MissingLinks'] = error.get('MissingLinks', 0) + 1
                    e = True
                if "title" not in data or data['title'] is None:
                    error['MissingTitle'] = error.get('MissingTitle', 0) + 1
                    e = True
                if 'subjects' not in data or data['subjects'] is None:
                    # NOTE(review): missing keywords are counted but do NOT
                    # mark the record as an error — this mirrors the original
                    # behavior; confirm it is intentional.
                    error['MissingKeywords'] = error.get('MissingKeywords', 0) + 1
                total += 1
                if e:
                    error_record += 1
                else:
                    valid += 1
    return (error, valid, total, error_record)