# RegistriesOverlap/code/r3dCoverage.py
#
# 195 lines
# 7.9 KiB
# Python

import glob
import xml.etree.ElementTree as ET
# Clark-notation namespace prefix shared by every re3data tag looked up below.
R3D_NS = '{http://www.re3data.org/schema/2-2}'

# One XML file per repository record.
repo_list = glob.glob('../data/re3dataRecords/*.xml')

# For each field, the number of records in which that field is filled in.
# Some keys (e.g. 'recordCount', 'softwareVersion', 'policyType') are never
# incremented by this script and therefore stay at 0.
r3_coverage = dict.fromkeys((
    'identifier',
    'name',
    'nameLanguage',
    'additionalName',
    'additionalNameLanguage',
    'repositoryURL',
    'repositoryType',
    'repositoryDescription',
    'repositoryContent',
    'recordURI',
    'recordCount',
    'subject',
    'keyword',
    'organizationId',
    'organizationName',
    'organizationAcronym',
    'organizationNameLanguage',
    'organizationCountry',
    'organizationLocationLat',
    'organizationLocationLong',
    'organizationUrl',
    'softwareName',
    'softwareVersion',
    'versioning',
    'apiType',
    'apiUrl',
    'apiDocumentation',
    'repository_status',
    'startDate',
    'lastUpdate',
    'policyType',
    'policyName',
    'policyUrl',
    'databaseAccessType',
    'databaseAccessRestriction',
    'dataUploadType',
    'dataUploadRestriction',
    'databaseLicenseName',
    'databaseLicenceUrl',
    'dataUploadLicenceName',
    'dataUploadLicenceUrl',
), 0)

# Number of records whose database access / data upload type is 'restricted'.
# (The misspelling of ``dataRestrictied`` is kept so the module-level name
# does not change.)
dbRestricted = 0
dataRestrictied = 0


def has_text(element):
    """Return True when *element* exists and holds non-blank text.

    ElementTree reports the text of an empty element (``<tag/>``) as ``None``
    rather than ``''``, so a plain ``text != ''`` comparison would wrongly
    treat empty elements as filled in.
    """
    return element is not None and bool(element.text and element.text.strip())


def has_attribute(element, attribute):
    """Return True when *element* exists and has a non-empty *attribute*."""
    return element is not None and element.attrib.get(attribute, '') != ''


def _first(parent, tag):
    """Return the first descendant of *parent* named *tag*, or None."""
    return parent.find('.//' + R3D_NS + tag)


def _every(parent, tag):
    """Return the list of all descendants of *parent* named *tag*."""
    return parent.findall('.//' + R3D_NS + tag)


def tally_record(root, coverage):
    """Add one parsed re3data record to the *coverage* counters.

    Every key of *coverage* is incremented at most once per call, so each
    counter ends up as "number of records in which the field is present".

    Args:
        root: root ``Element`` of one record's XML document.
        coverage: dict of field name -> count, updated in place. It must
            already contain every key this function increments.

    Returns:
        A ``(db_restricted, upload_restricted)`` tuple of booleans telling
        whether the record's database access type and data upload type are
        ``'restricted'``.
    """
    def mark(key, present):
        if present:
            coverage[key] += 1

    mark('identifier', has_text(_first(root, 're3data.orgIdentifier')))

    repo_name = _first(root, 'repositoryName')
    mark('name', has_text(repo_name))
    mark('nameLanguage', has_attribute(repo_name, 'language'))

    extra_name = _first(root, 'additionalName')
    mark('additionalName', has_text(extra_name))
    mark('additionalNameLanguage', has_attribute(extra_name, 'language'))

    # Single-valued fields that only need a "has text" check:
    # (coverage key, XML tag).
    simple_fields = (
        ('repositoryURL', 'repositoryURL'),
        ('repositoryType', 'type'),
        ('repositoryDescription', 'description'),
        ('repositoryContent', 'contentType'),
        ('recordURI', 'repositoryIdentifier'),
        ('versioning', 'versioning'),
        ('startDate', 'startDate'),
        ('lastUpdate', 'lastUpdate'),
    )
    for key, tag in simple_fields:
        mark(key, has_text(_first(root, tag)))

    # Repeatable fields: one filled-in occurrence is enough.
    mark('subject', any(has_text(s) for s in _every(root, 'subject')))
    mark('keyword', any(has_text(k) for k in _every(root, 'keyword')))

    institutions = _every(root, 'institution')
    institution_names = [_first(i, 'institutionName') for i in institutions]
    mark('organizationName', any(has_text(n) for n in institution_names))
    mark('organizationNameLanguage',
         any(has_attribute(n, 'language') for n in institution_names))
    # The institution's additional name is what feeds the "acronym" counter.
    mark('organizationAcronym',
         any(has_text(_first(i, 'institutionAdditionalName'))
             for i in institutions))
    mark('organizationCountry',
         any(has_text(_first(i, 'institutionCountry')) for i in institutions))
    mark('organizationUrl',
         any(has_text(_first(i, 'institutionURL')) for i in institutions))
    mark('organizationId',
         any(has_text(_first(i, 'institutionIdentifier'))
             for i in institutions))

    mark('softwareName',
         any(has_text(_first(s, 'softwareName'))
             for s in _every(root, 'software')))

    # The API URL is the element text; the API type is an attribute.
    apis = _every(root, 'api')
    mark('apiType', any(has_attribute(a, 'apiType') for a in apis))
    mark('apiUrl', any(has_text(a) for a in apis))

    # Checking the parent is enough: if a policy element exists, both
    # policyName and policyUrl are mandatory inside it. Counted once per
    # record (not once per policy element) so the figure stays comparable
    # with the number of records, like every other counter.
    has_policy = bool(_every(root, 'policy'))
    mark('policyName', has_policy)
    mark('policyUrl', has_policy)

    db_restricted = False
    db_access = _first(root, 'databaseAccess')
    if db_access is not None:
        access_type = _first(db_access, 'databaseAccessType')
        # Guarded: a missing type child must not raise AttributeError.
        if has_text(access_type):
            coverage['databaseAccessType'] += 1
            db_restricted = access_type.text.strip() == 'restricted'
        mark('databaseAccessRestriction',
             _first(db_access, 'databaseAccessRestriction') is not None)

    upload_restricted = False
    data_upload = _first(root, 'dataUpload')
    if data_upload is not None:
        upload_type = _first(data_upload, 'dataUploadType')
        if has_text(upload_type):
            coverage['dataUploadType'] += 1
            upload_restricted = upload_type.text.strip() == 'restricted'
        mark('dataUploadRestriction',
             _first(data_upload, 'dataUploadRestriction') is not None)

    # findall() returns a (possibly empty) list, never None, so test for
    # emptiness. As with policies, the presence of the licence element is
    # taken to imply its name and URL (assumption inherited from the
    # original script -- TODO confirm against the schema).
    has_db_license = bool(_every(root, 'databaseLicense'))
    mark('databaseLicenseName', has_db_license)
    mark('databaseLicenceUrl', has_db_license)

    has_upload_license = bool(_every(root, 'dataUploadLicense'))
    mark('dataUploadLicenceName', has_upload_license)
    mark('dataUploadLicenceUrl', has_upload_license)

    return db_restricted, upload_restricted


for repo in repo_list:
    db_is_restricted, upload_is_restricted = tally_record(
        ET.parse(repo).getroot(), r3_coverage)
    dbRestricted += db_is_restricted
    dataRestrictied += upload_is_restricted

print(r3_coverage)
print(len(repo_list))
print(dataRestrictied)
print(dbRestricted)