From 4c1474e6938cdd9cef37cec895b8aa232ac59758 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Fri, 20 Aug 2021 17:03:30 +0200 Subject: [PATCH] Dealing with #6859#note-2: we have to decode URLs to avoid & and other chars encoded becasue of the original XML representation of data --- .../dhp/oa/graph/raw/OafToOafMapper.java | 9 +++ .../dhp/oa/graph/raw/OdfToOafMapper.java | 18 ++++- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 35 +++++++++ .../dnetlib/dhp/oa/graph/raw/encoded-url.xml | 40 ++++++++++ .../dhp/oa/graph/raw/encoded-url_odf.xml | 75 +++++++++++++++++++ 5 files changed, 173 insertions(+), 4 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/encoded-url.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/encoded-url_odf.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 2b49a9dc1..5f9491029 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -164,6 +166,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { .filter(n -> StringUtils.isNotBlank(n.getText())) .map(n -> n.getText().trim()) .filter(u -> u.startsWith("http")) + .map(s -> { + try { + return URLDecoder.decode(s, "UTF-8"); + } catch (UnsupportedEncodingException e) { + return s; + } + }) .distinct() .collect(Collectors.toCollection(ArrayList::new))); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 02aab4f16..a365ac9aa 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.util.*; import java.util.stream.Collectors; @@ -137,17 +139,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Set url = new HashSet<>(); for (final Object o : doc .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='URL']")) { - url.add(((Node) o).getText().trim()); + url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } for (final Object o : doc .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='landingPage']")) { - url.add(((Node) o).getText().trim()); + url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='URL']")) { - url.add(((Node) o).getText().trim()); + url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) { - url.add(((Node) o).getText().trim()); + url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } for (final Object o : doc .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) { @@ -163,6 +165,14 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { return Arrays.asList(instance); } + protected String trimAndDecodeUrl(String url){ + try { + return URLDecoder.decode(url.trim(), "UTF-8"); + } catch (UnsupportedEncodingException e) { + return url; + } + } + @Override protected List> prepareSources(final Document doc, final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 5228d1d02..cc903b49e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -11,6 +11,7 @@ import java.util.List; import java.util.Objects; import java.util.Optional; +import com.fasterxml.jackson.core.JsonProcessingException; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.dom4j.DocumentException; @@ -759,6 +760,40 @@ class MappersTest { assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid()); } + + @Test + void testXMLEncodedURL() throws IOException, DocumentException { + final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url.xml"))); + final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + final Publication p = (Publication) list.get(0); + assertTrue(p.getInstance().size() > 0); + + String decoded = "https://www.ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5af388993&appId=PPGMS"; + assertEquals(decoded, p.getInstance().get(0).getUrl().get(0)); + } + + @Test + void testXMLEncodedURL_ODF() throws IOException, DocumentException { + final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml"))); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + final Dataset p = (Dataset) list.get(0); + assertTrue(p.getInstance().size() > 0); + for(String url : p.getInstance().get(0).getUrl()){ + System.out.println(url); + assertTrue(!url.contains("&")); + } + } + private void assertValidId(final String id) { // System.out.println(id); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/encoded-url.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/encoded-url.xml new file mode 100644 index 000000000..c9cafea4f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/encoded-url.xml @@ -0,0 +1,40 @@ + + +
+ r3c4b2081b22::0001f1a60b1acbb28dd66b4f6c7d881f + https://www.ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5af388993&appId=PPGMS + 2021-06-24T10:40:25.346Z + r3c4b2081b22 + 2021-06-24T10:40:55.153Z +
+ + https://www.ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e5af388993&appId=PPGMS + Progress report on preparing Strategic plan for Technology transfer from EPPL/FMPI CU + OPEN + + Documents, reports + 0034 + Progress report on preparation process of the technology transfer plan for CU. + corda__h2020::692335 + + + + + file%3A%2F%2F%2Fvar%2Flib%2Fdnet%2Fdata%2Fopendata%2Fcordis-h2020projectDeliverables.tsv + + + + + + + false + false + 0.9 + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/encoded-url_odf.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/encoded-url_odf.xml new file mode 100644 index 000000000..b970605a6 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/encoded-url_odf.xml @@ -0,0 +1,75 @@ + + + + opentrials__::0000bf8e63d3d7e6b88421eabafae3f6 + feabb67c-1fd1-423b-aec6-606d04ce53c6 + 2019-03-27T15:15:22.22Z + opentrials__ + 2019-04-17T16:04:20.586Z + + + + https://clinicaltrials.gov/ct2/show/NCT02321059&test=yes + + http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT02321059&test=yes + NCT02321059 + + + + Jensen, Kristian K + + + + Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia + + nct + + Denmark + + 0037 + + Patients with an incisional hernia in the midline and controls with an intact abdominal wall are examined twice with one week apart, in order to establish the test-retest reliability and internal and external validity of the Goodstrength trunk dynamometer. + + + OPEN + 0037 + 2014-11-11 + + + + + false + false + 0.9 + + + + + + + + + file:///var/lib/dnet/data/opentrials/opentrials.csv + + + + + + + false + false + 0.9 + + + + + \ No newline at end of file