forked from D-Net/dnet-hadoop
Dealing with #6859#note-2: we have to decode URLs to avoid & and other chars encoded becasue of the original XML representation of data
This commit is contained in:
parent
5f8ccbc365
commit
4c1474e693
|
@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw;
|
|||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URLDecoder;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
@ -164,6 +166,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
.filter(n -> StringUtils.isNotBlank(n.getText()))
|
||||
.map(n -> n.getText().trim())
|
||||
.filter(u -> u.startsWith("http"))
|
||||
.map(s -> {
|
||||
try {
|
||||
return URLDecoder.decode(s, "UTF-8");
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
return s;
|
||||
}
|
||||
})
|
||||
.distinct()
|
||||
.collect(Collectors.toCollection(ArrayList::new)));
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw;
|
|||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URLDecoder;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
@ -137,17 +139,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
final Set<String> url = new HashSet<>();
|
||||
for (final Object o : doc
|
||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='URL']")) {
|
||||
url.add(((Node) o).getText().trim());
|
||||
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
||||
}
|
||||
for (final Object o : doc
|
||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='landingPage']")) {
|
||||
url.add(((Node) o).getText().trim());
|
||||
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
||||
}
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='URL']")) {
|
||||
url.add(((Node) o).getText().trim());
|
||||
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
||||
}
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) {
|
||||
url.add(((Node) o).getText().trim());
|
||||
url.add(trimAndDecodeUrl(((Node) o).getText().trim()));
|
||||
}
|
||||
for (final Object o : doc
|
||||
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
|
||||
|
@ -163,6 +165,14 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
return Arrays.asList(instance);
|
||||
}
|
||||
|
||||
protected String trimAndDecodeUrl(String url){
|
||||
try {
|
||||
return URLDecoder.decode(url.trim(), "UTF-8");
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
return url;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
|
|
|
@ -11,6 +11,7 @@ import java.util.List;
|
|||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.DocumentException;
|
||||
|
@ -759,6 +760,40 @@ class MappersTest {
|
|||
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testXMLEncodedURL() throws IOException, DocumentException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url.xml")));
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
System.out.println("***************");
|
||||
|
||||
final Publication p = (Publication) list.get(0);
|
||||
assertTrue(p.getInstance().size() > 0);
|
||||
|
||||
String decoded = "https://www.ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5af388993&appId=PPGMS";
|
||||
assertEquals(decoded, p.getInstance().get(0).getUrl().get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testXMLEncodedURL_ODF() throws IOException, DocumentException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||
System.out.println("***************");
|
||||
|
||||
final Dataset p = (Dataset) list.get(0);
|
||||
assertTrue(p.getInstance().size() > 0);
|
||||
for(String url : p.getInstance().get(0).getUrl()){
|
||||
System.out.println(url);
|
||||
assertTrue(!url.contains("&"));
|
||||
}
|
||||
}
|
||||
|
||||
private void assertValidId(final String id) {
|
||||
// System.out.println(id);
|
||||
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:oaf="http://namespace.openaire.eu/oaf">
|
||||
<header>
|
||||
<dri:objIdentifier>r3c4b2081b22::0001f1a60b1acbb28dd66b4f6c7d881f</dri:objIdentifier>
|
||||
<dri:recordIdentifier>https://www.ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5af388993&appId=PPGMS</dri:recordIdentifier>
|
||||
<dri:dateOfCollection>2021-06-24T10:40:25.346Z</dri:dateOfCollection>
|
||||
<oaf:datasourceprefix>r3c4b2081b22</oaf:datasourceprefix>
|
||||
<dr:dateOfTransformation>2021-06-24T10:40:55.153Z</dr:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<dc:identifier>https://www.ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5af388993&appId=PPGMS</dc:identifier>
|
||||
<dc:title>Progress report on preparing Strategic plan for Technology transfer from EPPL/FMPI CU</dc:title>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:hostedBy id="openaire____::participantPortal" name="European Commission Participant Portal"/>
|
||||
<dc:type>Documents, reports</dc:type>
|
||||
<dr:CobjCategory type="publication">0034</dr:CobjCategory>
|
||||
<dc:description>Progress report on preparation process of the technology transfer plan for CU.</dc:description>
|
||||
<oaf:projectid validationDate="2019-11-26 09:39:59">corda__h2020::692335</oaf:projectid>
|
||||
<oaf:collectedFrom id="re3data_____::r3d100011728" name="European Union Open Data Portal"/>
|
||||
</metadata>
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2021-06-24T10:40:25.346Z">
|
||||
<baseURL>file%3A%2F%2F%2Fvar%2Flib%2Fdnet%2Fdata%2Fopendata%2Fcordis-h2020projectDeliverables.tsv</baseURL>
|
||||
<identifier/>
|
||||
<datestamp/>
|
||||
<metadataNamespace/>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk"
|
||||
classname="sysimport:crosswalk" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</record>
|
|
@ -0,0 +1,75 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<oai:record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
||||
<oai:header>
|
||||
<dri:objIdentifier>opentrials__::0000bf8e63d3d7e6b88421eabafae3f6</dri:objIdentifier>
|
||||
<dri:recordIdentifier>feabb67c-1fd1-423b-aec6-606d04ce53c6</dri:recordIdentifier>
|
||||
<dri:dateOfCollection>2019-03-27T15:15:22.22Z</dri:dateOfCollection>
|
||||
<oaf:datasourceprefix>opentrials__</oaf:datasourceprefix>
|
||||
<dr:dateOfTransformation>2019-04-17T16:04:20.586Z</dr:dateOfTransformation>
|
||||
</oai:header>
|
||||
<oai:metadata>
|
||||
<resource xmlns="http://datacite.org/schema/kernel-3"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
|
||||
<identifier identifierType="URL">https://clinicaltrials.gov/ct2/show/NCT02321059&test=yes</identifier>
|
||||
<alternateIdentifiers>
|
||||
<alternateIdentifier alternateIdentifierType="URL">http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT02321059&test=yes</alternateIdentifier>
|
||||
<alternateIdentifier alternateIdentifierType="nct">NCT02321059</alternateIdentifier>
|
||||
</alternateIdentifiers>
|
||||
<creators>
|
||||
<creator>
|
||||
<creatorName>Jensen, Kristian K</creatorName>
|
||||
</creator>
|
||||
</creators>
|
||||
<titles>
|
||||
<title>Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia</title>
|
||||
</titles>
|
||||
<publisher>nct</publisher>
|
||||
<geoLocations>
|
||||
<geoLocationPlace>Denmark</geoLocationPlace>
|
||||
</geoLocations>
|
||||
<resourceType resourceTypeGeneral="clinicalTrial">0037</resourceType>
|
||||
<descriptions>
|
||||
<description descriptionType="Abstract">Patients with an incisional hernia in the midline and controls with an intact abdominal wall are examined twice with one week apart, in order to establish the test-retest reliability and internal and external validity of the Goodstrength trunk dynamometer.</description>
|
||||
</descriptions>
|
||||
</resource>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<dr:CobjCategory type="dataset">0037</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2014-11-11</oaf:dateAccepted>
|
||||
<oaf:hostedBy id="openaire____::opentrials" name="OpenTrials"/>
|
||||
<oaf:collectedFrom id="openaire____::opentrials" name="OpenTrials"/>
|
||||
<oaf:about>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction
|
||||
classid="sysimport:crosswalk:datasetarchive"
|
||||
classname="sysimport:crosswalk:datasetarchive"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</oaf:about>
|
||||
</oai:metadata>
|
||||
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2019-03-27T15:15:22.22Z">
|
||||
<baseURL>file:///var/lib/dnet/data/opentrials/opentrials.csv</baseURL>
|
||||
<identifier/>
|
||||
<datestamp/>
|
||||
<metadataNamespace/>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
|
||||
classname="sysimport:crosswalk:datasetarchive"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</oai:record>
|
Loading…
Reference in New Issue