From bdece15ca0a84272ca17328c27abea8a6e2da521 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 30 Jul 2020 16:13:38 +0200 Subject: [PATCH 1/7] blacklist of nsprefix --- .../raw/MigrateDbEntitiesApplication.java | 33 +++++-- .../raw/common/VerifyNsPrefixPredicate.java | 62 +++++++++++++ .../graph/migrate_db_entities_parameters.json | 6 ++ .../oa/graph/raw_all/oozie_app/workflow.xml | 8 +- .../graph/raw_claims/oozie_app/workflow.xml | 7 +- .../oa/graph/raw_db/oozie_app/workflow.xml | 8 +- .../oa/graph/raw_step1/oozie_app/workflow.xml | 6 ++ .../common/VerifyNsPrefixPredicateTest.java | 92 +++++++++++++++++++ 8 files changed, 212 insertions(+), 10 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicate.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicateTest.java diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index da2ba47237..87c935d835 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -44,6 +44,7 @@ import java.util.Date; import java.util.List; import java.util.function.Consumer; import java.util.function.Function; +import java.util.function.Predicate; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -53,6 +54,7 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; +import eu.dnetlib.dhp.oa.graph.raw.common.VerifyNsPrefixPredicate; import 
eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -113,6 +115,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final String hdfsPath = parser.get("hdfsPath"); log.info("hdfsPath: {}", hdfsPath); + final String nsPrefixBlacklist = parser.get("nsPrefixBlacklist"); + log.info("nsPrefixBlacklist: {}", nsPrefixBlacklist); + + final Predicate verifyNamespacePrefix = new VerifyNsPrefixPredicate(nsPrefixBlacklist); + final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); log.info("processClaims: {}", processClaims); @@ -123,23 +130,25 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i smdbe.execute("queryClaims.sql", smdbe::processClaims); } else { log.info("Processing datasources..."); - smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + smdbe.execute("queryDatasources.sql", smdbe::processDatasource, verifyNamespacePrefix); log.info("Processing projects..."); if (dbSchema.equalsIgnoreCase("beta")) { - smdbe.execute("queryProjects.sql", smdbe::processProject); + smdbe.execute("queryProjects.sql", smdbe::processProject, verifyNamespacePrefix); } else { - smdbe.execute("queryProjects_production.sql", smdbe::processProject); + smdbe.execute("queryProjects_production.sql", smdbe::processProject, verifyNamespacePrefix); } log.info("Processing orgs..."); - smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + smdbe.execute("queryOrganizations.sql", smdbe::processOrganization, verifyNamespacePrefix); log.info("Processing relationsNoRemoval ds <-> orgs ..."); - smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + smdbe + .execute( + "queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization, verifyNamespacePrefix); log.info("Processing projects <-> orgs ..."); - 
package eu.dnetlib.dhp.oa.graph.raw.common;

import java.util.HashSet;
import java.util.Set;
import java.util.function.Predicate;

import org.apache.commons.lang3.StringUtils;

import com.google.common.base.Splitter;

import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Relation;

/**
 * This predicate should be used to skip oaf objects using a blacklist of nsprefixes.
 * <p>
 * The blacklist is a comma separated list of namespace prefixes. Entries are compared literally
 * (they are not regular expressions): a value is rejected when it starts with one of the entries,
 * either directly (e.g. {@code corda__2020}) or right after a two-digit type marker followed by
 * a pipe (e.g. {@code 10|corda_______::abc}).
 *
 * @author michele
 */
public class VerifyNsPrefixPredicate implements Predicate<Oaf> {

	/** Length of the optional leading type marker: two digits followed by '|'. */
	private static final int TYPE_MARKER_LENGTH = 3;

	/** Blacklisted namespace prefixes, filled once by the constructor. */
	final Set<String> invalids = new HashSet<>();

	/**
	 * Builds the predicate from a comma separated blacklist.
	 *
	 * @param blacklist comma separated namespace prefixes; entries are trimmed and empty entries are
	 *                  dropped. A null or blank value yields a predicate that accepts everything.
	 */
	public VerifyNsPrefixPredicate(final String blacklist) {
		if (StringUtils.isNotBlank(blacklist)) {
			Splitter
				.on(",")
				.trimResults()
				.omitEmptyStrings()
				.split(blacklist)
				.forEach(invalids::add);
		}
	}

	/**
	 * Evaluates an object against the blacklist.
	 * <ul>
	 * <li>{@link Datasource}: its namespace prefix is checked;</li>
	 * <li>any other {@link OafEntity}: its id is checked;</li>
	 * <li>{@link Relation}: both source and target are checked;</li>
	 * <li>anything else (including null) is accepted.</li>
	 * </ul>
	 *
	 * @param oaf the object to verify
	 * @return true if the object must be kept, false if it must be skipped
	 */
	@Override
	public boolean test(final Oaf oaf) {
		if (oaf instanceof Datasource) {
			final Datasource ds = (Datasource) oaf;
			// A datasource without a namespace prefix cannot match the blacklist: treat it like a blank
			// value instead of failing with a NullPointerException.
			if (ds.getNamespaceprefix() == null) {
				return true;
			}
			return testValue(ds.getNamespaceprefix().getValue());
		} else if (oaf instanceof OafEntity) {
			return testValue(((OafEntity) oaf).getId());
		} else if (oaf instanceof Relation) {
			final Relation rel = (Relation) oaf;
			return testValue(rel.getSource()) && testValue(rel.getTarget());
		} else {
			return true;
		}
	}

	/**
	 * Checks a single value (a namespace prefix or an identifier) against the blacklist.
	 *
	 * @param s the value to verify, may be null
	 * @return false if the value starts with a blacklisted prefix, optionally preceded by a two-digit
	 *         type marker and a pipe; true otherwise, including for null or blank values
	 */
	protected boolean testValue(final String s) {
		if (StringUtils.isBlank(s)) {
			return true;
		}
		final boolean typed = hasTypeMarker(s);
		for (final String invalid : invalids) {
			// Plain prefix comparison: no regex is built, so entries containing regex metacharacters
			// are matched as written and nothing is recompiled for every processed object.
			if (s.startsWith(invalid) || (typed && s.startsWith(invalid, TYPE_MARKER_LENGTH))) {
				return false;
			}
		}
		return true;
	}

	/** Tells whether the value begins with two ASCII digits followed by '|' (e.g. "10|"). */
	private static boolean hasTypeMarker(final String s) {
		return s.length() >= TYPE_MARKER_LENGTH
			&& isAsciiDigit(s.charAt(0))
			&& isAsciiDigit(s.charAt(1))
			&& s.charAt(2) == '|';
	}

	/** ASCII-only digit check, matching what the regex class \d accepts by default. */
	private static boolean isAsciiDigit(final char c) {
		return c >= '0' && c <= '9';
	}

}
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index d8b61b5ead..d8146d9a22 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -43,7 +43,11 @@ isLookupUrl the address of the lookUp service - + + nsPrefixBlacklist + + a blacklist of nsprefixes (comma separated) + sparkDriverMemory memory for driver process @@ -131,6 +135,7 @@ --isLookupUrl${isLookupUrl} --actionclaims --dbschema${dbSchema} + --nsPrefixBlacklist${nsPrefixBlacklist} @@ -182,6 +187,7 @@ --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} --dbschema${dbSchema} + --nsPrefixBlacklist${nsPrefixBlacklist} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml index 66eaeeb263..4c319d0378 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml @@ -38,7 +38,11 @@ isLookupUrl the address of the lookUp service - + + nsPrefixBlacklist + + a blacklist of nsprefixes (comma separated) + sparkDriverMemory memory for driver process @@ -113,6 +117,7 @@ --isLookupUrl${isLookupUrl} --actionclaims --dbschema${dbSchema} + --nsPrefixBlacklist${nsPrefixBlacklist} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml index eea8d0a5ab..29d4269ef8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml @@ -25,7 +25,11 @@ isLookupUrl the address of the lookUp service - + + nsPrefixBlacklist + + a blacklist of nsprefixes (comma separeted) + sparkDriverMemory memory for driver process @@ -99,6 +103,7 @@ --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} --dbschema${dbSchema} + --nsPrefixBlacklist${nsPrefixBlacklist} @@ -117,6 +122,7 @@ --isLookupUrl${isLookupUrl} --dbschema${dbSchema} --actionclaims + --nsPrefixBlacklist${nsPrefixBlacklist} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml index 8684181526..9b68cfb053 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml @@ -28,6 +28,11 @@ isLookupUrl the address of the lookUp service + + nsPrefixBlacklist + + a blacklist of nsprefixes (comma separeted) + sparkDriverMemory memory for driver process @@ -67,6 +72,7 @@ -pguser${postgresUser} -pgpasswd${postgresPassword} -islookup${isLookupUrl} + --nsPrefixBlacklist${nsPrefixBlacklist} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicateTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicateTest.java new file mode 100644 index 0000000000..a14fb4ae33 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicateTest.java @@ -0,0 +1,92 @@ + +package eu.dnetlib.dhp.oa.graph.raw.common; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import 
org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Relation; + +class VerifyNsPrefixPredicateTest { + + private VerifyNsPrefixPredicate predicate; + + @BeforeEach + void setUp() throws Exception { + predicate = new VerifyNsPrefixPredicate("corda,nsf,wt"); + } + + @Test + void testTestValue() { + assertFalse(predicate.testValue("corda__2020")); + assertFalse(predicate.testValue("nsf________")); + assertFalse(predicate.testValue("nsf")); + assertFalse(predicate.testValue("corda")); + assertFalse(predicate.testValue("10|corda_______::fjkdsfjksdhfksj")); + assertFalse(predicate.testValue("20|corda_______::fjkdsfjksdhfksj")); + + assertTrue(predicate.testValue("xxxxxx_____")); + assertTrue(predicate.testValue("10|xxxxxx_____::sdasdasaddasad")); + + assertTrue(predicate.testValue(null)); + assertTrue(predicate.testValue("")); + } + + @Test + void testTest_ds_true() { + final Field prefix = new Field<>(); + prefix.setValue("xxxxxx______"); + + final Datasource ds = new Datasource(); + ds.setNamespaceprefix(prefix); + + assertTrue(predicate.test(ds)); + } + + @Test + void testTest_ds_false() { + final Field prefix = new Field<>(); + prefix.setValue("corda__2020"); + + final Datasource ds = new Datasource(); + ds.setNamespaceprefix(prefix); + + assertFalse(predicate.test(ds)); + } + + @Test + void testTest_rel_true() { + final Relation rel = new Relation(); + rel.setSource("10|yyyyyy______:sdfsfsffsdfs"); + rel.setTarget("10|xxxxxx______:sdfsfsffsdfs"); + assertTrue(predicate.test(rel)); + } + + @Test + void testTest_rel_false() { + final Relation rel = new Relation(); + rel.setSource("10|corda_______:sdfsfsffsdfs"); + rel.setTarget("10|xxxxxx______:sdfsfsffsdfs"); + assertFalse(predicate.test(rel)); + } + + @Test + void testTest_proj_true() { + final Project p = new Project(); + 
p.setId("10|xxxxxx______:sdfsfsffsdfs"); + assertTrue(predicate.test(p)); + } + + @Test + void testTest_proj_false() { + final Project p = new Project(); + p.setId("10|corda_____:sdfsfsffsdfs"); + assertFalse(predicate.test(p)); + } + +} From 8cc067fe768c94f3fb44f999312647c02f8fe0e3 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Mon, 3 Aug 2020 11:17:50 +0200 Subject: [PATCH 2/7] specific test for claims --- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 11 ++ .../dhp/oa/graph/raw/oaf_claim_dedup.xml | 182 ++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_dedup.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index dcb4c49b4c..0a513f6337 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -276,6 +276,17 @@ public class MappersTest { System.out.println("***************"); } + + @Test + void testClaimDedup() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_dedup.xml")); + final List list = new OafToOafMapper(vocs, false).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + } + private void assertValidId(final String id) { assertEquals(49, id.length()); assertEquals('|', id.charAt(2)); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_dedup.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_dedup.xml new file mode 100644 index 0000000000..95457fb701 --- /dev/null +++ 
b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_dedup.xml @@ -0,0 +1,182 @@ + + + dedup_wf_001::534276867e917fe9efe0cca10e363457 + 2020-08-02T22:55:40.866Z + openaire____ + 2020-08-02T23:53:04.582Z + + + ATLAS (IHEF, IoP, FNWI) + Doğuş Üniversitesi, Fen Edebiyat Fakültesi, Fizik Bölümü + TR3959 + Doğuş Üniversitesi, Fen Edebiyat Fakültesi, Fizik Bölümü + TR3959 + urn:issn:1748-0221 + VOLUME=7;ISSUE=1;ISSN=1748-0221;TITLE=Journal of Instrumentation + ATLAS Collaboration Mitsou, Vasiliki Fiorini, Luca Ros Martínez, Eduardo Castillo + Giménez, María Victoria Fuster Verdú, Juan A. García García, Carmen Cabrera Urbán, + Susana Martí García, Salvador Salt Cairols, José Lacasta Llácer, Carlos Valls Ferrer, + Juan Antonio Higón Rodríguez, Emilio Ferrer Soria, Antonio González de la Hoz, Santiago + Kaci, Mohammed Hernández Jiménez, Yesenia Villaplana Pérez, Miguel 2012 A study of the + material in the ATLAS inner detector using secondary hadronic interactions Journal Of + Instrumentation 7 P01013 1 41 + Journal of Instrumentation, 7(1) + Aad, G; Abbott, B; Abdallah, J; Abdelalim, AA; Abdesselam, A; Abdinov, O;  et + al.(2012). A study of the material in the ATLAS inner detector using secondary hadronic + interactions. Journal of Instrumentation, 7(1). doi: 10.1088/1748-0221/7/01/P01013. UC + Santa Cruz: Retrieved from: http://www.escholarship.org/uc/item/05j2j2br + Journal of Instrumentation, 7 + VOLUME=7;ISSN=1748-0221;TITLE=Journal of Instrumentation + 1748-0221 + Journal of Instrumentation 7, P01013 (2012). 
+ doi:10.1088/1748-0221/7/01/P01013 + A measurement of the material in the ATLAS inner detector using secondary hadronic + interactions + und + Detector modelling and simulations I + (interaction of radiation with matter, interaction of photons with matter, interaction + of hadrons with matter, etc); Particle tracking detectors (Solid-state detectors); Si + microstrip and pad detectors; Large detector systems for particle and astroparticle + physics + of photons with matter, interaction of + hadrons with matter, etc) + Particle Physics - Experiment + Detector Modelling and + Simulations + Detector modelling and simulations I + (interaction of radiation with matter, interaction of photons with matter, interaction + of hadrons with matter, etc) + Large detector systems for particle and + astroparticle physics + Detector modelling and simulations I + (interaction of radiation with matter, interaction + Large Detector Systems + 530 + Science & Technology + :Ciências Físicas [Ciências + Naturais] + High Energy Physics - + Experiment + Detectors de radiació + Física nuclear + ddc:610 + Si microstrip and pad + detectors + Particle tracking detectors (Solid-state + detectors) + Col·lisions (Física nuclear) + Particle Tracking Detectors + IOP Publishing + application/pdf + application/pdf + application/pdf + application/pdf + application/pdf + application/pdf + 2016-05-02 + The ATLAS inner detector is used to reconstruct secondary vertices due to + hadronic interactions of primary collision products, so probing the location and amount + of material in the inner region of ATLAS. Data collected in 7 TeV pp collisions at the + LHC, with a minimum bias trigger, are used for comparisons with simulated events. The + reconstructed secondary vertices have spatial resolutions ranging from ~ 200μm to 1 mm. + The overall material description in the simulation is validated to within an + experimental uncertainty of about 7%. 
This will lead to a better understanding of the + reconstruction of various objects such as tracks, leptons, jets, and missing transverse + momentum. We acknowledge the support of ANPCyT, Argentina; YerPhI, Armenia; ARC, + Australia; BMWF, Austria; ANAS, Azerbaijan; SSTC, Belarus; CNPq and FAPESP, Brazil; + NSERC, NRC and CFI, Canada; CERN; CONICYT, Chile; CAS, MOST and NSFC, China; + COLCIENCIAS, Colombia; MSMT CR, MPO CR and VSC CR, Czech Republic; DNRF, DNSRC and + Lundbeck Foundation, Denmark; ARTEMIS, European Union; IN2P3-CNRS, CEA-DSM/IRFU, France; + GNAS, Georgia; BMBF, DFG, HGF, MPG and AvH Foundation, Germany; GSRT, Greece; ISF, + MINERVA, GIF, DIP and Benoziyo Center, Israel; INFN, Italy; MEXT and JSPS, Japan; CNRST, + Morocco; FOM and NWO, Netherlands; RCN, Norway; MNiSW, Poland; GRICES and FCT, Portugal; + MERYS (MECTS), Romania; MES of Russia and ROSATOM, Russian Federation; JINR; MSTD, + Serbia; MSSR, Slovakia; ARRS and MVZT, Slovenia; DST/NRF, South Africa; MICINN, Spain; + SRC and Wallenberg Foundation, Sweden; SER, SNSF and Cantons of Bern and Geneva, + Switzerland; NSC, Taiwan; TAEK, Turkey; STFC, the Royal Society and Leverhulme Trust, + United Kingdom; DOE and NSF, United States of America. + info:eu-repo/semantics/publishedVersion + NARCIS + DSpace@Dogus + Lancaster EPrints + CERN Document Server + DESY Publication Database + OpenAIRE + Publikationenserver der Georg-August-Universität Göttingen + arXiv.org e-Print Archive + CORE (RIOXX-UK Aggregator) + eScholarship - University of California + Universidade do Minho: RepositoriUM + Dokuz Eylul University Open Archive System + Repositori d'Objectes Digitals per a l'Ensenyament la Recerca i la + Cultura + info:eu-repo/semantics/altIdentifier/doi/10.1088/1748-0221/7/01/P01013 + info:eu-repo/semantics/altIdentifier/doi/10.1088/1748-0221/7/01/P01013. 
+ Article + http://hdl.handle.net/11376/1605 + Article + http://www.escholarship.org/uc/item/05j2j2br + Unknown + http://cds.cern.ch/record/1394292 + Article + http://eprints.lancs.ac.uk/68235/ + Article + http://hdl.handle.net/10550/36188 + Article + http://eprints.gla.ac.uk/65933/1/65933.pdf + Preprint + http://arxiv.org/abs/1110.6191 + Article + http://dare.uva.nl/personal/pure/en/publications/a-study-of-the-material-in-the-atlas-inner-detector-using-secondary-hadronic-interactions(6b7667e2-04e2-4a66-92a8-ff4edbf61a17).html + Article + http://hdl.handle.net/1822/48768 + Article + http://resolver.sub.uni-goettingen.de/purl?gs-1/12231 + Article + http://bib-pubdb1.desy.de/search?p=id:%22PHPPUBDB-21212%22 + http://bib-pubdb1.desy.de/record/96807/files/CERN-PH-EP-2011-147_1110.6191v2.pdf + http://bib-pubdb1.desy.de//record/96807/files/CERN-PH-EP-2011-147_1110.6191v2.pdf + http://bib-pubdb1.desy.de/record/96807 + Article + http://arxiv.org/abs/1110.6191 + Article + http://hdl.handle.net/11376/1605 + http://dx.doi.org/10.1088/1748-0221/7/01/P01013 + Article + http://hdl.handle.net/2066/93736 + ATLAS Collaboration + 0001 + 0002 + 2016-05-02 + OPEN + und + + + + + + + + + file%3A%2F%2F%2Fsrv%2Fclaims%2Frecords%2Fpublication%2Fopenaire + + + + + + + false + false + 0.9 + + + + + From c35bf486cc87ce8b6acd9ccbf4238fae776b1c9e Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 4 Aug 2020 12:50:12 +0200 Subject: [PATCH 3/7] added handle among the possible PIDs --- .../test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt | 1 + .../src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt index 05484c8e50..729296522b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt +++ 
b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt @@ -951,6 +951,7 @@ dnet:countries @=@ ZW @=@ ABW dnet:protocols @=@ oai @=@ OAI-PMH dnet:protocols @=@ oai @=@ OAI_PMH dnet:pid_types @=@ orcid @=@ ORCID12 +dnet:pid_types @=@ handle @=@ hdl dnet:review_levels @=@ 0000 @=@ UNKNOWN dnet:review_levels @=@ 0002 @=@ 80 大阪経大学会「Working Paper」 dnet:review_levels @=@ 0002 @=@ AO diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt index 59bed7c3a0..93cc00eca4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt @@ -1045,6 +1045,7 @@ dnet:pid_types @=@ dnet:pid_types @=@ pmid @=@ pmid dnet:pid_types @=@ dnet:pid_types @=@ urn @=@ urn dnet:pid_types @=@ dnet:pid_types @=@ who @=@ WHO Identifier dnet:pid_types @=@ dnet:pid_types @=@ drks @=@ DRKS Identifier +dnet:pid_types @=@ dnet:pid_types @=@ handle @=@ Handle dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ACM @=@ An ACM classification term that can be associated to your publications dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ARXIV @=@ An ARXIV classification term that can be associated to your publications dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/DDC @=@ A Dewey Decimal classification term (DDC) that can be associated to your publications From 09a323d18de0d495fcc4f45a8822197709753180 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 4 Aug 2020 12:50:52 +0200 Subject: [PATCH 4/7] testing a dataset from Nakala --- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 23 +++++ .../dnetlib/dhp/oa/graph/raw/odf_nakala.xml | 88 +++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 
dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_nakala.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 0a513f6337..50f190a61e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -287,6 +287,29 @@ public class MappersTest { System.out.println("***************"); } + @Test + void testNakala() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_nakala.xml")); + final List list = new OdfToOafMapper(vocs, false).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Dataset); + + final Dataset d = (Dataset) list.get(0); + + assertValidId(d.getId()); + assertValidId(d.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); + assertEquals(1, d.getAuthor().size()); + assertEquals(0, d.getSubject().size()); + assertEquals(1, d.getInstance().size()); + assertEquals(1, d.getPid().size()); + } + private void assertValidId(final String id) { assertEquals(49, id.length()); assertEquals('|', id.charAt(2)); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_nakala.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_nakala.xml new file mode 100644 index 0000000000..105d0c413f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_nakala.xml @@ -0,0 +1,88 @@ + + + + r3f5b9831893::cca7367159bc3ff90cd2f75bf9dc21c4 + oai:nakala.fr:hdl_11280_847e01df 
+ 2020-08-01T00:16:24.742Z + r3f5b9831893 + oai:nakala.fr:hdl_11280_847e01df + 2020-06-08T01:01:38Z + hdl_11280_2b09fc10 + hdl_11280_c1bc48d0 + hdl_11280_57c8db3a + 2020-08-01T00:31:35.625Z + + + + 277 + http://hdl.handle.net/11280/847e01df + + http://hdl.handle.net/http://hdl.handle.net/11280/847e01df + + http://nakala.fr/data/11280/847e01df + + + DHAAP + + + + CVP_Notice277-1 place du Docteur Antoine Béclère _PHO02.jpg + + + Hôpital Saint-Antoine. Fragment de dalle funéraire trouvée en décembre 1932. Paris (XIIème arr.). Photographie d'Albert Citerne (1876-1970). Plaque de verre, 1932. Département Histoire de l'Architecture et Archéologie de Paris. + Nfa_1146 + Hôpital Saint-Antoine. Fragment de dalle funéraire trouvée en décembre 1932. Paris (XIIème arr.). Photographie d'Albert Citerne (1876-1970). Plaque de verre, 1932. Département Histoire de l'Architecture et Archéologie de Paris. + + Nakala by Huma-Num + + + DHAAP, Pôle Archéologique + + + + 1932 + + StillImage + + + + + http://hdl.handle.net/11280/847e01df + 0025 + + OPEN + und + + + + + + + https%3A%2F%2Fwww.nakala.fr%2Foai_oa%2F11280%2F8892ab4b + oai:nakala.fr:hdl_11280_847e01df + 2020-06-08T01:01:38Z + + + + + false + false + 0.9 + + + + + \ No newline at end of file From b4e4e5f858dd9250da1d5257249b758791295bf6 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 4 Aug 2020 12:52:14 +0200 Subject: [PATCH 5/7] do not duplicate result PIDs --- .../java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 62f8123bb3..fa0e5221db 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -12,6 +12,7 @@ 
import java.util.HashSet; import java.util.List; import java.util.Set; +import com.google.common.collect.Lists; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Node; @@ -366,7 +367,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List prepareResultPids(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); + final Set res = new HashSet(); res .addAll( prepareListStructPropsWithValidQualifier( @@ -382,7 +383,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']", "@alternateIdentifierType", DNET_PID_TYPES, info)); - return res; + return Lists.newArrayList(res); } } From 01db29e208090adf8228829bdcf2391c0f762fc0 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 4 Aug 2020 12:53:48 +0200 Subject: [PATCH 6/7] fixes redmine issue #5846: datacite and its different namespace declarations --- .../dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 5159fa9bb6..7a04e0c3d4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -24,7 +24,9 @@ public abstract class AbstractMdRecordToOafMapper { private final boolean invisible; protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; + protected static final String DATACITE_SCHEMA_KERNEL_4_SLASH = "http://datacite.org/schema/kernel-4/"; protected static final String 
DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; + protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/"; protected static final Qualifier ORCID_PID_TYPE = qualifier( "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); protected static final Qualifier MAG_PID_TYPE = qualifier( @@ -55,7 +57,7 @@ public abstract class AbstractMdRecordToOafMapper { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); final Document doc = DocumentHelper - .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); + .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3).replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3).replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3)); final KeyValue collectedFrom = getProvenanceDatasource( doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); From a29565ff5723b273055748fb1c21d6e06a283883 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 4 Aug 2020 12:55:27 +0200 Subject: [PATCH 7/7] code formatting --- .../dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java | 6 +++++- .../java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java | 3 ++- .../test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 1 - 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 7a04e0c3d4..5b6ae72f18 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -57,7 +57,11 @@ public abstract class AbstractMdRecordToOafMapper { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); 
final Document doc = DocumentHelper - .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3).replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3).replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3)); + .parseText( + xml + .replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3) + .replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3) + .replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3)); final KeyValue collectedFrom = getProvenanceDatasource( doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index fa0e5221db..6fe7bb9713 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -12,11 +12,12 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import com.google.common.collect.Lists; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Node; +import com.google.common.collect.Lists; + import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Author; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 50f190a61e..2c10f8f580 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -276,7 +276,6 @@ public class MappersTest { System.out.println("***************"); } - @Test void testClaimDedup() throws 
IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_dedup.xml"));