imported uniprot validation and parsing scripts

parent 7decdade98
commit 6d119457fc

main.py | 8 +++++++-
main.py
@@ -3,12 +3,18 @@ from pdb.pdb_metadata_extractor import MetadataExctractor
 from pdb.pdb_validator import validate
 from uniprot.download import UniprotSwissDownloader
 from uniprot.metadata import MetadataExctractor as ME
+from uniprot.uniprot_validator import validate as validate_uniprot
+
 if __name__ == '__main__':
     u = UniprotSwissDownloader()
     #u.download()
     k = ME()
-    k.extract_metadata()
+    error, valid, total, error_record = validate_uniprot()
+    print(error)
+    print(f"Valid {valid}/{total}")
+    print(f"Error {error_record}/{total}")
+    # k.extract_metadata()
 
 
     # p = PDBDownloader()
     # p.get_file_to_downloads(max_item=4)
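Note: the new entry point wires the three uniprot stages together: download the Swiss-Prot dump, extract metadata into gzipped JSON-lines parts, then validate them. A minimal sketch of that flow with the download and extraction steps enabled (main.py above leaves them commented out):

    from uniprot.download import UniprotSwissDownloader
    from uniprot.metadata import MetadataExctractor as ME
    from uniprot.uniprot_validator import validate as validate_uniprot

    if __name__ == '__main__':
        UniprotSwissDownloader().download()   # fetch the Swiss-Prot flat file
        ME().extract_metadata()               # parse into uniprot_metadata/dump_0N.gz parts
        error, valid, total, error_record = validate_uniprot()
        print(error)                          # per-check error counts
        print(f"Valid {valid}/{total}")
        print(f"Error {error_record}/{total}")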
uniprot/metadata.py
@@ -15,6 +15,7 @@ class MetadataExctractor:
 
     def get_metadata(self) :
         p = {}
+        subjCount = 0
         for line in self.current_file:
             l = line.decode("utf-8").strip()
             if l.startswith("AC"):
@@ -34,11 +35,12 @@ class MetadataExctractor:
             elif l.startswith("OS "):
                 p['organism_species'] =l[4:-1].strip()
 
-            elif l.startswith("OC "):
+            elif l.startswith("OC ") and subjCount < 20:
                 s = l[4:-1].strip().split(';')
                 subjects = p.get('subjects', [])
                 for sub in s:
                     subjects.append(sub.strip())
+                subjCount += 1
                 p['subjects']= subjects
             elif l.startswith("RX "):
                 references = [c.strip() for c in l[4:-1].strip().split(';')]
@@ -51,7 +53,11 @@ class MetadataExctractor:
                 if len(relations):
                     p["references"] = relations
             elif l.startswith("//"):
-                yield json.dumps(p)
+                s = json.dumps(p)
+                subjCount = 0
+                del p
+                p = {}
+                yield s
 
 
     def extract_metadata(self, output_path="uniprot_metadata"):
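Note: get_metadata dispatches on the two-character line codes of the UniProtKB flat-file format, and l[4:-1] drops the line code plus its padding and the trailing character. A heavily truncated record, with illustrative values, looks like:

    AC   P12345;
    OS   Homo sapiens (Human).
    OC   Eukaryota; Metazoa; Chordata; Craniata; Vertebrata;
    OC   Mammalia; Primates; Hominidae; Homo.
    RX   PubMed=2394677; DOI=10.1016/0006-291X(91)91737-N;
    //

The new subjCount guard stops accumulating taxonomy terms after 20 OC lines per record, and the reworked // branch serializes the record and resets both subjCount and p before yielding, so state cannot leak into the next record.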
@@ -65,11 +71,9 @@ class MetadataExctractor:
             i +=1
             w.write(item.encode())
             w.write("\n".encode())
-            if i % 10000==0:
+            if i % 100000==0:
                 part +=1
                 w.flush()
                 w.close()
                 print(f"PARSED {i}")
-                w = gzip.open(f"dump_0{part}.gz", "w")
+                w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
-
-
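Note: with the output_path fix, extract_metadata now writes its rotating parts into the same directory the validator globs for dump*.gz (next file). A standalone sketch of the rotation pattern, with illustrative names, assuming one JSON document per line:

    import gzip

    def write_parts(records, output_path="uniprot_metadata", per_part=100000):
        # Roll over to a fresh gzip part every per_part records so no
        # single dump grows unbounded; assumes output_path already exists.
        part, i = 0, 0
        w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        for item in records:
            i += 1
            w.write(item.encode())
            w.write(b"\n")
            if i % per_part == 0:
                part += 1
                w.flush()
                w.close()
                w = gzip.open(f"{output_path}/dump_0{part}.gz", "w")
        w.close()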
uniprot/uniprot_validator.py (new file)
@@ -0,0 +1,33 @@
+import json
+import gzip
+import glob
+
+
+
+def validate(input_path="uniprot_metadata"):
+    error = {}
+    valid = 0
+    total = 0
+    error_record = 0
+    for i in glob.glob(f"{input_path}/dump*.gz"):
+        with gzip.open(i) as f:
+            for line in f:
+                data = json.loads(line.decode("utf-8"))
+                e = False
+                if "dates" not in data or data['dates'] is None or len(data['dates']) == 0:
+                    error['MissingDate'] = error.get('MissingDate', 0) + 1
+                    e = True
+                if 'pid' not in data or data['pid'] is None or len(data['pid']) == 0:
+                    error['MissingLinks'] = error.get('MissingLinks', 0) + 1
+                    e = True
+                if "title" not in data or data['title'] is None:
+                    error['MissingTitle'] = error.get('MissingTitle', 0) + 1
+                    e = True
+                if 'subjects' not in data or data['subjects'] is None:
+                    error['MissingKeywords'] = error.get('MissingKeywords', 0) + 1  # counted, but does not flag the record
+                total += 1
+                if e:
+                    error_record += 1
+                else:
+                    valid += 1
+    return (error, valid, total, error_record)
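Note: validate returns a 4-tuple rather than raising, so callers can report every failure class in one pass over the dumps. A minimal usage sketch (the counts in the comment are illustrative):

    from uniprot.uniprot_validator import validate

    error, valid, total, error_record = validate(input_path="uniprot_metadata")
    print(error)   # e.g. {'MissingDate': 12, 'MissingKeywords': 3}
    print(f"Valid {valid}/{total}")
    print(f"Error {error_record}/{total}")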