1
0
Fork 0

Dealing with #6859#note-2: we have to decode URLs so that & and other characters do not stay encoded because of the original XML representation of the data

This commit is contained in:
Alessia Bardi 2021-08-20 17:03:30 +02:00
parent 5f8ccbc365
commit 4c1474e693
5 changed files with 173 additions and 4 deletions

View File

@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
@ -164,6 +166,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
.filter(n -> StringUtils.isNotBlank(n.getText())) .filter(n -> StringUtils.isNotBlank(n.getText()))
.map(n -> n.getText().trim()) .map(n -> n.getText().trim())
.filter(u -> u.startsWith("http")) .filter(u -> u.startsWith("http"))
.map(s -> {
try {
return URLDecoder.decode(s, "UTF-8");
} catch (UnsupportedEncodingException e) {
return s;
}
})
.distinct() .distinct()
.collect(Collectors.toCollection(ArrayList::new))); .collect(Collectors.toCollection(ArrayList::new)));

View File

@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -137,17 +139,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final Set<String> url = new HashSet<>(); final Set<String> url = new HashSet<>();
for (final Object o : doc for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='URL']")) { .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='URL']")) {
url.add(((Node) o).getText().trim()); url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
} }
for (final Object o : doc for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='landingPage']")) { .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='landingPage']")) {
url.add(((Node) o).getText().trim()); url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
} }
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='URL']")) { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='URL']")) {
url.add(((Node) o).getText().trim()); url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
} }
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
url.add(((Node) o).getText().trim()); url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
} }
for (final Object o : doc for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) { .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
@ -163,6 +165,14 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
return Arrays.asList(instance); return Arrays.asList(instance);
} }
/**
 * Strips leading/trailing whitespace from the given URL and URL-decodes it as UTF-8.
 *
 * Decoding is best-effort: when the value cannot be decoded (for instance because it contains
 * a literal {@code %} that is not a valid percent-escape) the trimmed, undecoded value is
 * returned instead of propagating the failure to the caller.
 *
 * Note: {@link URLDecoder} implements form decoding, so a {@code +} in the input is turned
 * into a space.
 *
 * @param url the raw URL text; must not be null
 * @return the trimmed and decoded URL, or the trimmed input when decoding is not possible
 */
protected String trimAndDecodeUrl(String url) {
	final String trimmed = url.trim();
	try {
		return URLDecoder.decode(trimmed, "UTF-8");
	} catch (UnsupportedEncodingException | IllegalArgumentException e) {
		// IllegalArgumentException is raised by URLDecoder on malformed escapes such as "%zz"
		// or a trailing "%": keep the URL as it is rather than failing the record.
		return trimmed;
	}
}
@Override @Override
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) { protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ??? return new ArrayList<>(); // Not present in ODF ???

View File

@ -11,6 +11,7 @@ import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import com.fasterxml.jackson.core.JsonProcessingException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
@ -759,6 +760,40 @@ class MappersTest {
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid()); assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
} }
/**
 * Maps the "encoded-url.xml" OAF record and verifies that the first URL of the first
 * instance carries a plain "&" between its query parameters.
 */
@Test
void testXMLEncodedURL() throws IOException, DocumentException {
	final String expectedUrl = "https://www.ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5af388993&appId=PPGMS";

	final String record = IOUtils
		.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url.xml")));
	final List<Oaf> mapped = new OafToOafMapper(vocs, false, true).processMdRecord(record);

	// dump the mapped entities, framed by separator lines
	final String separator = "***************";
	System.out.println(separator);
	System.out.println(new ObjectMapper().writeValueAsString(mapped));
	System.out.println(separator);

	final Publication publication = (Publication) mapped.get(0);
	assertTrue(!publication.getInstance().isEmpty());

	final String actualUrl = publication.getInstance().get(0).getUrl().get(0);
	assertEquals(expectedUrl, actualUrl);
}
/**
 * Maps the "encoded-url_odf.xml" ODF record and verifies that none of the URLs of the
 * first instance still contains the XML entity "&amp;amp;".
 */
@Test
void testXMLEncodedURL_ODF() throws IOException, DocumentException {
	final String record = IOUtils
		.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml")));
	final List<Oaf> mapped = new OdfToOafMapper(vocs, false, true).processMdRecord(record);

	// dump the mapped entities, framed by separator lines
	final String separator = "***************";
	System.out.println(separator);
	System.out.println(new ObjectMapper().writeValueAsString(mapped));
	System.out.println(separator);

	final Dataset dataset = (Dataset) mapped.get(0);
	assertTrue(!dataset.getInstance().isEmpty());

	// print each URL and check it right away, in list order
	dataset.getInstance().get(0).getUrl().forEach(u -> {
		System.out.println(u);
		assertTrue(!u.contains("&amp;"));
	});
}
private void assertValidId(final String id) { private void assertValidId(final String id) {
// System.out.println(id); // System.out.println(id);

View File

@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:oaf="http://namespace.openaire.eu/oaf">
<header>
<dri:objIdentifier>r3c4b2081b22::0001f1a60b1acbb28dd66b4f6c7d881f</dri:objIdentifier>
<dri:recordIdentifier>https://www.ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5af388993&amp;appId=PPGMS</dri:recordIdentifier>
<dri:dateOfCollection>2021-06-24T10:40:25.346Z</dri:dateOfCollection>
<oaf:datasourceprefix>r3c4b2081b22</oaf:datasourceprefix>
<dr:dateOfTransformation>2021-06-24T10:40:55.153Z</dr:dateOfTransformation>
</header>
<metadata>
<dc:identifier>https://www.ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5af388993&amp;appId=PPGMS</dc:identifier>
<dc:title>Progress report on preparing Strategic plan for Technology transfer from EPPL/FMPI CU</dc:title>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:hostedBy id="openaire____::participantPortal" name="European Commission Participant Portal"/>
<dc:type>Documents, reports</dc:type>
<dr:CobjCategory type="publication">0034</dr:CobjCategory>
<dc:description>Progress report on preparation process of the technology transfer plan for CU.</dc:description>
<oaf:projectid validationDate="2019-11-26 09:39:59">corda__h2020::692335</oaf:projectid>
<oaf:collectedFrom id="re3data_____::r3d100011728" name="European Union Open Data Portal"/>
</metadata>
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2021-06-24T10:40:25.346Z">
<baseURL>file%3A%2F%2F%2Fvar%2Flib%2Fdnet%2Fdata%2Fopendata%2Fcordis-h2020projectDeliverables.tsv</baseURL>
<identifier/>
<datestamp/>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk"
classname="sysimport:crosswalk" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</record>

View File

@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8"?>
<oai:record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:oai="http://www.openarchives.org/OAI/2.0/">
<oai:header>
<dri:objIdentifier>opentrials__::0000bf8e63d3d7e6b88421eabafae3f6</dri:objIdentifier>
<dri:recordIdentifier>feabb67c-1fd1-423b-aec6-606d04ce53c6</dri:recordIdentifier>
<dri:dateOfCollection>2019-03-27T15:15:22.22Z</dri:dateOfCollection>
<oaf:datasourceprefix>opentrials__</oaf:datasourceprefix>
<dr:dateOfTransformation>2019-04-17T16:04:20.586Z</dr:dateOfTransformation>
</oai:header>
<oai:metadata>
<resource xmlns="http://datacite.org/schema/kernel-3"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
<identifier identifierType="URL">https://clinicaltrials.gov/ct2/show/NCT02321059&amp;test=yes</identifier>
<alternateIdentifiers>
<alternateIdentifier alternateIdentifierType="URL">http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT02321059&amp;test=yes</alternateIdentifier>
<alternateIdentifier alternateIdentifierType="nct">NCT02321059</alternateIdentifier>
</alternateIdentifiers>
<creators>
<creator>
<creatorName>Jensen, Kristian K</creatorName>
</creator>
</creators>
<titles>
<title>Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia</title>
</titles>
<publisher>nct</publisher>
<geoLocations>
<geoLocationPlace>Denmark</geoLocationPlace>
</geoLocations>
<resourceType resourceTypeGeneral="clinicalTrial">0037</resourceType>
<descriptions>
<description descriptionType="Abstract">Patients with an incisional hernia in the midline and controls with an intact abdominal wall are examined twice with one week apart, in order to establish the test-retest reliability and internal and external validity of the Goodstrength trunk dynamometer.</description>
</descriptions>
</resource>
<oaf:accessrights>OPEN</oaf:accessrights>
<dr:CobjCategory type="dataset">0037</dr:CobjCategory>
<oaf:dateAccepted>2014-11-11</oaf:dateAccepted>
<oaf:hostedBy id="openaire____::opentrials" name="OpenTrials"/>
<oaf:collectedFrom id="openaire____::opentrials" name="OpenTrials"/>
<oaf:about>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction
classid="sysimport:crosswalk:datasetarchive"
classname="sysimport:crosswalk:datasetarchive"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</oaf:about>
</oai:metadata>
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2019-03-27T15:15:22.22Z">
<baseURL>file:///var/lib/dnet/data/opentrials/opentrials.csv</baseURL>
<identifier/>
<datestamp/>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
classname="sysimport:crosswalk:datasetarchive"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</oai:record>