From 093f1aff03c21f969b6c30c0d6cf74e74d83e2bc Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 26 May 2020 13:06:55 +0200 Subject: [PATCH 1/7] result pids (new xpaths + IS vocabularies) --- .../raw/AbstractMdRecordToOafMapper.java | 167 ++++++++---------- .../raw/GenerateEntitiesApplication.java | 161 ++++++++++------- .../dhp/oa/graph/raw/OafToOafMapper.java | 33 ++-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 54 +++--- .../graph/generate_entities_parameters.json | 7 +- 5 files changed, 217 insertions(+), 205 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 5c89d5096f..278544e0b0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -10,10 +10,27 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY; +import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS; +import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; +import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; @@ -47,10 +64,8 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; - protected static final Qualifier ORCID_PID_TYPE = qualifier( - "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); - protected static final Qualifier MAG_PID_TYPE = qualifier( - "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier ORCID_PID_TYPE = qualifier("ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier MAG_PID_TYPE = qualifier("MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); protected static final Map nsContext = new HashMap<>(); @@ -64,8 +79,7 @@ public abstract class AbstractMdRecordToOafMapper { nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); } - protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( - "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -79,20 +93,15 @@ public abstract class AbstractMdRecordToOafMapper { .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + final KeyValue collectedFrom = getProvenanceDatasource(doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); - if (collectedFrom == null) { - return null; - } + if (collectedFrom == null) { return null; } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); - if (hostedBy == null) { - return null; - } + if (hostedBy == null) { return null; } final DataInfo info = prepareDataInfo(doc); final long lastUpdateTimestamp = new Date().getTime(); @@ -107,9 +116,7 @@ public abstract class AbstractMdRecordToOafMapper { final String dsId = doc.valueOf(xpathId); final String dsName = doc.valueOf(xpathName); - if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { - return null; - } + if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { return null; } return keyValue(createOpenaireId(10, dsId, true), dsName); } @@ -125,47 +132,47 @@ public abstract class AbstractMdRecordToOafMapper { final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(DATASET_DEFAULT_RESULTTYPE); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); - break; - case "": - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(ORP_DEFAULT_RESULTTYPE); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + d.setResulttype(DATASET_DEFAULT_RESULTTYPE); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "": + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(ORP_DEFAULT_RESULTTYPE); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; } if (!oafs.isEmpty()) { @@ -194,15 +201,9 @@ public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add( - getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, lastUpdateTimestamp)); } } @@ -247,10 +248,7 @@ public abstract class AbstractMdRecordToOafMapper { r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); r.setCollectedfrom(Arrays.asList(collectedFrom)); - r - .setPid( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + r.setPid(prepareResultPids(doc, info)); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES @@ -278,6 +276,8 @@ public abstract class AbstractMdRecordToOafMapper { r.setBestaccessright(getBestAccessRights(instances)); } + protected abstract List prepareResultPids(Document doc, DataInfo info); + private List prepareContexts(final Document doc, final DataInfo info) { final List list = new ArrayList<>(); for (final Object o : doc.selectNodes("//oaf:concept")) { @@ -358,7 +358,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - protected static Qualifier getBestAccessRights(List instanceList) { + protected static Qualifier getBestAccessRights(final List instanceList) { if (instanceList != null) { final Optional min = instanceList .stream() @@ -398,9 +398,7 @@ public abstract class AbstractMdRecordToOafMapper { final String sp = n.valueOf("@sp"); final String vol = n.valueOf("@vol"); final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { - return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); - } + if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } } return null; } @@ -453,10 +451,7 @@ public abstract class AbstractMdRecordToOafMapper { for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add( - structuredProperty( - n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), - n.valueOf("@schemename"), info)); + .add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n.valueOf("@schemename"), info)); } return res; } @@ -464,9 +459,7 @@ public abstract class AbstractMdRecordToOafMapper { protected OAIProvenance prepareOAIprovenance(final Document doc) { final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - if (n == null) { - return null; - } + if (n == null) { return null; } final String identifier = n.valueOf("./*[local-name()='identifier']"); final String baseURL = n.valueOf("./*[local-name()='baseURL']"); @@ -481,9 +474,7 @@ public abstract class AbstractMdRecordToOafMapper { protected DataInfo prepareDataInfo(final Document doc) { final Node n = doc.selectSingleNode("//oaf:datainfo"); - if (n == null) { - return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); - } + if (n == null) { return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); @@ -495,9 +486,7 @@ public abstract class AbstractMdRecordToOafMapper { final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); final String trust = n.valueOf("./oaf:trust"); - return dataInfo( - deletedbyinference, inferenceprovenance, inferred, false, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 739c7a4629..d184364174 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -5,7 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; import java.sql.SQLException; -import java.util.*; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; @@ -27,7 +32,19 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import scala.Tuple2; public class GenerateEntitiesApplication { @@ -39,14 +56,12 @@ public class GenerateEntitiesApplication { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString( - MigrateMongoMdstoresApplication.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); + .toString(GenerateEntitiesApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional + final Boolean isSparkSessionManaged = Optional .ofNullable(parser.get("isSparkSessionManaged")) .map(Boolean::valueOf) .orElse(Boolean.TRUE); @@ -58,17 +73,17 @@ public class GenerateEntitiesApplication { final String dbUrl = parser.get("postgresUrl"); final String dbUser = parser.get("postgresUser"); final String dbPassword = parser.get("postgresPassword"); + final String isLookupUrl = parser.get("isLookupUrl"); - final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); + final Map code2name = loadVocsFromDB(dbUrl, dbUser, dbPassword); - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, targetPath); - generateEntities(spark, code2name, sourcePaths, targetPath); - }); + code2name.putAll(loadVocsFromIS(isLookupUrl)); + + final SparkConf conf = new SparkConf(); + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, targetPath); + generateEntities(spark, code2name, sourcePaths, targetPath); + }); } private static void generateEntities( @@ -77,7 +92,7 @@ public class GenerateEntitiesApplication { final String sourcePaths, final String targetPath) { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final List existingSourcePaths = Arrays .stream(sourcePaths.split(",")) .filter(p -> exists(sc, p)) @@ -90,27 +105,25 @@ public class GenerateEntitiesApplication { for (final String sp : existingSourcePaths) { inputRdd = inputRdd - .union( - sc - .sequenceFile(sp, Text.class, Text.class) - .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> convertToListOaf(k._1(), k._2(), code2name)) - .filter(Objects::nonNull) - .flatMap(list -> list.iterator())); + .union(sc + .sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .filter(Objects::nonNull) + .flatMap(list -> list.iterator())); } inputRdd .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) .reduceByKey((o1, o2) -> merge(o1, o2)) .map(Tuple2::_2) - .map( - oaf -> oaf.getClass().getSimpleName().toLowerCase() - + "|" - + OBJECT_MAPPER.writeValueAsString(oaf)) + .map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + + "|" + + OBJECT_MAPPER.writeValueAsString(oaf)) .saveAsTextFile(targetPath, GzipCodec.class); } - private static Oaf merge(Oaf o1, Oaf o2) { + private static Oaf merge(final Oaf o1, final Oaf o2) { if (ModelSupport.isSubClass(o1, OafEntity.class)) { ((OafEntity) o1).mergeFrom((OafEntity) o2); } else if (ModelSupport.isSubClass(o1, Relation.class)) { @@ -122,37 +135,41 @@ public class GenerateEntitiesApplication { } private static List convertToListOaf( - final String id, final String s, final Map code2name) { + final String id, + final String s, + final Map code2name) { final String type = StringUtils.substringAfter(id, ":"); switch (type.toLowerCase()) { - case "native_oaf": - return new OafToOafMapper(code2name).processMdRecord(s); - case "native_odf": - return new OdfToOafMapper(code2name).processMdRecord(s); - case "datasource": - return Arrays.asList(convertFromJson(s, Datasource.class)); - case "organization": - return Arrays.asList(convertFromJson(s, Organization.class)); - case "project": - return Arrays.asList(convertFromJson(s, Project.class)); - case "relation": - return Arrays.asList(convertFromJson(s, Relation.class)); - case "publication": - return Arrays.asList(convertFromJson(s, Publication.class)); - case "dataset": - return Arrays.asList(convertFromJson(s, Dataset.class)); - case "software": - return Arrays.asList(convertFromJson(s, Software.class)); - case "otherresearchproduct": - return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); - default: - throw new RuntimeException("type not managed: " + type.toLowerCase()); + case "native_oaf": + return new OafToOafMapper(code2name).processMdRecord(s); + case "native_odf": + return new OdfToOafMapper(code2name).processMdRecord(s); + case "datasource": + return Arrays.asList(convertFromJson(s, Datasource.class)); + case "organization": + return Arrays.asList(convertFromJson(s, Organization.class)); + case "project": + return Arrays.asList(convertFromJson(s, Project.class)); + case "relation": + return Arrays.asList(convertFromJson(s, Relation.class)); + case "publication": + return Arrays.asList(convertFromJson(s, Publication.class)); + case "dataset": + return Arrays.asList(convertFromJson(s, Dataset.class)); + case "software": + return Arrays.asList(convertFromJson(s, Software.class)); + case "otherresearchproduct": + return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); + default: + throw new RuntimeException("type not managed: " + type.toLowerCase()); } } - private static Map loadClassNames( - final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + private static Map loadVocsFromDB( + final String dbUrl, + final String dbUser, + final String dbPassword) throws IOException { log.info("Loading vocabulary terms from db..."); @@ -160,15 +177,13 @@ public class GenerateEntitiesApplication { try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { dbClient - .processResults( - "select code, name from class", - rs -> { - try { - map.put(rs.getString("code"), rs.getString("name")); - } catch (final SQLException e) { - e.printStackTrace(); - } - }); + .processResults("select code, name from class", rs -> { + try { + map.put(rs.getString("code"), rs.getString("name")); + } catch (final SQLException e) { + e.printStackTrace(); + } + }); } log.info("Found " + map.size() + " terms."); @@ -176,6 +191,24 @@ public class GenerateEntitiesApplication { return map; } + private static Map loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); + + final String xquery = + IOUtils.toString(GenerateEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); + + final Map map = new HashMap<>(); + + for (final String s : isLookUpService.quickSearchProfile(xquery)) { + final String[] arr = s.split("@=@"); + if (arr.length == 4) { + map.put(arr[2].trim(), arr[3].trim()); + } + } + + return map; + } + private static Oaf convertFromJson(final String s, final Class clazz) { try { return OBJECT_MAPPER.readValue(s, clazz); @@ -196,7 +229,7 @@ public class GenerateEntitiesApplication { } } - private static void removeOutputDir(SparkSession spark, String path) { + private static void removeOutputDir(final SparkSession spark, final String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index af9fe7197e..61fe08722e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -130,8 +130,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype( - prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -147,14 +146,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); instance - .setUrl( - nodes - .stream() - .filter(n -> StringUtils.isNotBlank(n.getText())) - .map(n -> n.getText().trim()) - .filter(u -> u.startsWith("http")) - .distinct() - .collect(Collectors.toCollection(ArrayList::new))); + .setUrl(nodes + .stream() + .filter(n -> StringUtils.isNotBlank(n.getText())) + .map(n -> n.getText().trim()) + .filter(u -> u.startsWith("http")) + .distinct() + .collect(Collectors.toCollection(ArrayList::new))); return Lists.newArrayList(instance); } @@ -279,15 +277,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final String otherId = createOpenaireId(50, originalId, false); res - .add( - getRelation( - docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); } } return res; @@ -297,4 +289,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { return null; // NOT PRESENT IN OAF } + + @Override + protected List prepareResultPids(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 9c74c4a938..9026874366 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -120,8 +120,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype( - prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -170,10 +169,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { res - .add( - structuredProperty( - ((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, - info)); + .add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, info)); } } return res; @@ -225,16 +221,14 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareOtherResearchProductContactGroups( final Document doc, final DataInfo info) { - return prepareListFields( - doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons( final Document doc, final DataInfo info) { - return prepareListFields( - doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); } @Override @@ -260,8 +254,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareSoftwareDocumentationUrls( final Document doc, final DataInfo info) { - return prepareListFields( - doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); + return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } // DATASETS @@ -335,29 +328,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { if (type.equalsIgnoreCase("IsSupplementTo")) { res - .add( - getRelation( - docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, lastUpdateTimestamp)); } else if (type.equals("IsPartOf")) { res - .add( - getRelation( - docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, - lastUpdateTimestamp)); - } else { - } + .add(getRelation(otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, lastUpdateTimestamp)); + } else {} } } return res; @@ -365,8 +345,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return prepareQualifier( - doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, - DNET_DATA_CITE_RESOURCE); + return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, DNET_DATA_CITE_RESOURCE); } + + @Override + protected List prepareResultPids(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + res.addAll(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + res.addAll(prepareListStructProps(doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + res.addAll(prepareListStructProps(doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", "@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info)); + return res; + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json index 293bf041b9..d4d0935a0e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json @@ -34,6 +34,11 @@ "paramLongName": "postgresPassword", "paramDescription": "postgres password", "paramRequired": false + }, + { + "paramName": "islookup", + "paramLongName": "isLookupUrl", + "paramDescription": "the url of the ISLookupService", + "paramRequired": true } - ] \ No newline at end of file From c6af36496a6b03b497043b735070afaff939ca7d Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 26 May 2020 13:11:09 +0200 Subject: [PATCH 2/7] result pids (new xpaths + IS vocabularies) --- .../eu/dnetlib/dhp/schema/oaf/Result.java | 13 +- .../dhp/oa/dedup/EntityMergerTest.java | 4 +- .../raw/AbstractMdRecordToOafMapper.java | 137 +++++++++++------- .../raw/GenerateEntitiesApplication.java | 75 +++++----- .../dhp/oa/graph/raw/OafToOafMapper.java | 31 ++-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 61 ++++++-- 6 files changed, 198 insertions(+), 123 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index 213a585a8c..11fdaa4f92 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -244,20 +244,20 @@ public class Result extends OafEntity implements Serializable { subject = mergeLists(subject, r.getSubject()); - //merge title lists: main title with higher trust and distinct between the others + // merge title lists: main title with higher trust and distinct between the others StructuredProperty baseMainTitle = null; - if(title != null) { + if (title != null) { baseMainTitle = getMainTitle(title); title.remove(baseMainTitle); } StructuredProperty newMainTitle = null; - if(r.getTitle() != null) { + if (r.getTitle() != null) { newMainTitle = getMainTitle(r.getTitle()); r.getTitle().remove(newMainTitle); } - if (newMainTitle != null && compareTrust(this, r) < 0 ) + if (newMainTitle != null && compareTrust(this, r) < 0) baseMainTitle = newMainTitle; title = mergeLists(title, r.getTitle()); @@ -314,8 +314,9 @@ public class Result extends OafEntity implements Serializable { } private StructuredProperty getMainTitle(List titles) { - //need to check if the list of titles contains more than 1 main title? (in that case, we should chose which main title select in the list) - for (StructuredProperty title: titles) { + // need to check if the list of titles contains more than 1 main title? (in that case, we should chose which + // main title select in the list) + for (StructuredProperty title : titles) { if (title.getQualifier() != null && title.getQualifier().getClassid() != null) if (title.getQualifier().getClassid().equals("main title")) return title; diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java index f4b2c22522..b8ccb038d7 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java @@ -91,9 +91,9 @@ public class EntityMergerTest implements Serializable { assertEquals(pub_merged.getAuthor().size(), 9); assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4); - //verify title + // verify title int count = 0; - for (StructuredProperty title: pub_merged.getTitle()){ + for (StructuredProperty title : pub_merged.getTitle()) { if (title.getQualifier().getClassid().equals("main title")) count++; } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 278544e0b0..99334dd7f9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -64,8 +64,10 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; - protected static final Qualifier ORCID_PID_TYPE = qualifier("ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); - protected static final Qualifier MAG_PID_TYPE = qualifier("MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier ORCID_PID_TYPE = qualifier( + "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier MAG_PID_TYPE = qualifier( + "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); protected static final Map nsContext = new HashMap<>(); @@ -79,7 +81,8 @@ public abstract class AbstractMdRecordToOafMapper { nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); } - protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( + "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -93,15 +96,20 @@ public abstract class AbstractMdRecordToOafMapper { .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = getProvenanceDatasource(doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + final KeyValue collectedFrom = getProvenanceDatasource( + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); - if (collectedFrom == null) { return null; } + if (collectedFrom == null) { + return null; + } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); - if (hostedBy == null) { return null; } + if (hostedBy == null) { + return null; + } final DataInfo info = prepareDataInfo(doc); final long lastUpdateTimestamp = new Date().getTime(); @@ -116,7 +124,9 @@ public abstract class AbstractMdRecordToOafMapper { final String dsId = doc.valueOf(xpathId); final String dsName = doc.valueOf(xpathName); - if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { return null; } + if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { + return null; + } return keyValue(createOpenaireId(10, dsId, true), dsName); } @@ -132,47 +142,47 @@ public abstract class AbstractMdRecordToOafMapper { final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(DATASET_DEFAULT_RESULTTYPE); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); - break; - case "": - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(ORP_DEFAULT_RESULTTYPE); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + d.setResulttype(DATASET_DEFAULT_RESULTTYPE); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "": + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(ORP_DEFAULT_RESULTTYPE); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; } if (!oafs.isEmpty()) { @@ -201,9 +211,15 @@ public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add(getRelation(docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, + lastUpdateTimestamp)); } } @@ -398,7 +414,9 @@ public abstract class AbstractMdRecordToOafMapper { final String sp = n.valueOf("@sp"); final String vol = n.valueOf("@vol"); final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } + if (StringUtils.isNotBlank(name)) { + return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); + } } return null; } @@ -451,7 +469,10 @@ public abstract class AbstractMdRecordToOafMapper { for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n.valueOf("@schemename"), info)); + .add( + structuredProperty( + n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), + n.valueOf("@schemename"), info)); } return res; } @@ -459,7 +480,9 @@ public abstract class AbstractMdRecordToOafMapper { protected OAIProvenance prepareOAIprovenance(final Document doc) { final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - if (n == null) { return null; } + if (n == null) { + return null; + } final String identifier = n.valueOf("./*[local-name()='identifier']"); final String baseURL = n.valueOf("./*[local-name()='baseURL']"); @@ -474,7 +497,9 @@ public abstract class AbstractMdRecordToOafMapper { protected DataInfo prepareDataInfo(final Document doc) { final Node n = doc.selectSingleNode("//oaf:datainfo"); - if (n == null) { return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } + if (n == null) { + return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); @@ -486,7 +511,9 @@ public abstract class AbstractMdRecordToOafMapper { final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); final String trust = n.valueOf("./oaf:trust"); - return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + return dataInfo( + deletedbyinference, inferenceprovenance, inferred, false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index d184364174..0d140e9adc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -56,8 +56,9 @@ public class GenerateEntitiesApplication { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(GenerateEntitiesApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); + .toString( + GenerateEntitiesApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); parser.parseArgument(args); @@ -105,21 +106,23 @@ public class GenerateEntitiesApplication { for (final String sp : existingSourcePaths) { inputRdd = inputRdd - .union(sc - .sequenceFile(sp, Text.class, Text.class) - .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> convertToListOaf(k._1(), k._2(), code2name)) - .filter(Objects::nonNull) - .flatMap(list -> list.iterator())); + .union( + sc + .sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .filter(Objects::nonNull) + .flatMap(list -> list.iterator())); } inputRdd .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) .reduceByKey((o1, o2) -> merge(o1, o2)) .map(Tuple2::_2) - .map(oaf -> oaf.getClass().getSimpleName().toLowerCase() - + "|" - + OBJECT_MAPPER.writeValueAsString(oaf)) + .map( + oaf -> oaf.getClass().getSimpleName().toLowerCase() + + "|" + + OBJECT_MAPPER.writeValueAsString(oaf)) .saveAsTextFile(targetPath, GzipCodec.class); } @@ -141,28 +144,28 @@ public class GenerateEntitiesApplication { final String type = StringUtils.substringAfter(id, ":"); switch (type.toLowerCase()) { - case "native_oaf": - return new OafToOafMapper(code2name).processMdRecord(s); - case "native_odf": - return new OdfToOafMapper(code2name).processMdRecord(s); - case "datasource": - return Arrays.asList(convertFromJson(s, Datasource.class)); - case "organization": - return Arrays.asList(convertFromJson(s, Organization.class)); - case "project": - return Arrays.asList(convertFromJson(s, Project.class)); - case "relation": - return Arrays.asList(convertFromJson(s, Relation.class)); - case "publication": - return Arrays.asList(convertFromJson(s, Publication.class)); - case "dataset": - return Arrays.asList(convertFromJson(s, Dataset.class)); - case "software": - return Arrays.asList(convertFromJson(s, Software.class)); - case "otherresearchproduct": - return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); - default: - throw new RuntimeException("type not managed: " + type.toLowerCase()); + case "native_oaf": + return new OafToOafMapper(code2name).processMdRecord(s); + case "native_odf": + return new OdfToOafMapper(code2name).processMdRecord(s); + case "datasource": + return Arrays.asList(convertFromJson(s, Datasource.class)); + case "organization": + return Arrays.asList(convertFromJson(s, Organization.class)); + case "project": + return Arrays.asList(convertFromJson(s, Project.class)); + case "relation": + return Arrays.asList(convertFromJson(s, Relation.class)); + case "publication": + return Arrays.asList(convertFromJson(s, Publication.class)); + case "dataset": + return Arrays.asList(convertFromJson(s, Dataset.class)); + case "software": + return Arrays.asList(convertFromJson(s, Software.class)); + case "otherresearchproduct": + return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); + default: + throw new RuntimeException("type not managed: " + type.toLowerCase()); } } @@ -194,8 +197,10 @@ public class GenerateEntitiesApplication { private static Map loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); - final String xquery = - IOUtils.toString(GenerateEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); + final String xquery = IOUtils + .toString( + GenerateEntitiesApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); final Map map = new HashMap<>(); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 61fe08722e..3b8f1170f7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -130,7 +130,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype( + prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -146,13 +147,14 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); instance - .setUrl(nodes - .stream() - .filter(n -> StringUtils.isNotBlank(n.getText())) - .map(n -> n.getText().trim()) - .filter(u -> u.startsWith("http")) - .distinct() - .collect(Collectors.toCollection(ArrayList::new))); + .setUrl( + nodes + .stream() + .filter(n -> StringUtils.isNotBlank(n.getText())) + .map(n -> n.getText().trim()) + .filter(u -> u.startsWith("http")) + .distinct() + .collect(Collectors.toCollection(ArrayList::new))); return Lists.newArrayList(instance); } @@ -277,9 +279,15 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final String otherId = createOpenaireId(50, originalId, false); res - .add(getRelation(docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, + lastUpdateTimestamp)); } } return res; @@ -292,6 +300,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List prepareResultPids(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info); + return prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 9026874366..f8d9b3cd92 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -120,7 +120,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype( + prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -169,7 +170,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { res - .add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, info)); + .add( + structuredProperty( + ((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, + info)); } } return res; @@ -221,14 +225,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareOtherResearchProductContactGroups( final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); + return prepareListFields( + doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons( final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); + return prepareListFields( + doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); } @Override @@ -254,7 +260,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareSoftwareDocumentationUrls( final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); + return prepareListFields( + doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } // DATASETS @@ -328,16 +335,29 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { if (type.equalsIgnoreCase("IsSupplementTo")) { res - .add(getRelation(docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, + lastUpdateTimestamp)); } else if (type.equals("IsPartOf")) { res - .add(getRelation(docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, lastUpdateTimestamp)); - } else {} + .add( + getRelation( + otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, + lastUpdateTimestamp)); + } else { + } } } return res; @@ -345,15 +365,28 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, DNET_DATA_CITE_RESOURCE); + return prepareQualifier( + doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, + DNET_DATA_CITE_RESOURCE); } @Override protected List prepareResultPids(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); - res.addAll(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); - res.addAll(prepareListStructProps(doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); - res.addAll(prepareListStructProps(doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", "@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info)); + res + .addAll( + prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + res + .addAll( + prepareListStructProps( + doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types", + "dnet:pid_types", info)); + res + .addAll( + prepareListStructProps( + doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", + "@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info)); return res; } From c15d997925722b77262ab8c068d844ef4274de6a Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 26 May 2020 13:13:17 +0200 Subject: [PATCH 3/7] xquery --- .../eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery new file mode 100644 index 0000000000..4938c0aa42 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery @@ -0,0 +1,5 @@ +for $x in collection(' /db/DRIVER/VocabularyDSResources/VocabularyDSResourceType') + let $vocid := $x//VOCABULARY_NAME/@code + let $vocname := $x//VOCABULARY_NAME/text() + for $term in ($x//TERM) + return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name) From 3ceb2d2853d7f6ebcc99ad48fe96373632b3a11b Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 27 May 2020 11:34:13 +0200 Subject: [PATCH 4/7] match terms with vocabularies --- .../raw/AbstractMdRecordToOafMapper.java | 46 ++++++++---- .../raw/GenerateEntitiesApplication.java | 70 +++++++------------ .../dhp/oa/graph/raw/OafToOafMapper.java | 18 ++--- .../dhp/oa/graph/raw/OdfToOafMapper.java | 32 ++++----- .../dhp/oa/graph/raw/common/Vocabulary.java | 42 +++++++++++ .../oa/graph/raw/common/VocabularyGroup.java | 49 +++++++++++++ .../oa/graph/raw/common/VocabularyTerm.java | 22 ++++++ .../graph/generate_entities_parameters.json | 20 +----- .../oa/graph/raw_all/oozie_app/workflow.xml | 12 ++-- .../oa/graph/raw_step2/oozie_app/workflow.xml | 17 ++--- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 27 +++++-- 11 files changed, 225 insertions(+), 130 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 99334dd7f9..ab1e89187a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -38,6 +38,7 @@ import org.dom4j.DocumentFactory; import org.dom4j.DocumentHelper; import org.dom4j.Node; +import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.common.LicenseComparator; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Context; @@ -60,7 +61,7 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public abstract class AbstractMdRecordToOafMapper { - protected final Map code2name; + protected final VocabularyGroup vocs; protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; @@ -84,8 +85,8 @@ public abstract class AbstractMdRecordToOafMapper { protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); - protected AbstractMdRecordToOafMapper(final Map code2name) { - this.code2name = code2name; + protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs) { + this.vocs = vocs; } public List processMdRecord(final String xml) { @@ -421,14 +422,12 @@ public abstract class AbstractMdRecordToOafMapper { return null; } - protected Qualifier prepareQualifier( - final Node node, - final String xpath, - final String schemeId, - final String schemeName) { - final String classId = node.valueOf(xpath); - final String className = code2name.get(classId); - return qualifier(classId, className, schemeId, schemeName); + protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) { + return prepareQualifier(node.valueOf(xpath).trim(), schemeId); + } + + protected Qualifier prepareQualifier(final String classId, final String schemeId) { + return vocs.getTermAsQualifier(schemeId, classId); } protected List prepareListStructProps( @@ -436,14 +435,31 @@ public abstract class AbstractMdRecordToOafMapper { final String xpath, final String xpathClassId, final String schemeId, - final String schemeName, final DataInfo info) { final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; - final String classId = n.valueOf(xpathClassId); - final String className = code2name.get(classId); - res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); + final String classId = n.valueOf(xpathClassId).trim(); + res.add(structuredProperty(n.getText(), prepareQualifier(classId, schemeId), info)); + } + return res; + } + + protected List prepareListStructPropsWithValidQualifier( + final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final DataInfo info) { + final List res = new ArrayList<>(); + + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + final String classId = n.valueOf(xpathClassId).trim(); + if (vocs.termExists(schemeId, classId)) { + res.add(structuredProperty(n.getText(), vocs.getTermAsQualifier(schemeId, classId), info)); + } } return res; } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 0d140e9adc..3becdec448 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -4,11 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; -import java.sql.SQLException; import java.util.Arrays; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; @@ -29,8 +26,8 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Datasource; @@ -71,25 +68,24 @@ public class GenerateEntitiesApplication { final String sourcePaths = parser.get("sourcePaths"); final String targetPath = parser.get("targetPath"); - final String dbUrl = parser.get("postgresUrl"); - final String dbUser = parser.get("postgresUser"); - final String dbPassword = parser.get("postgresPassword"); + // final String dbUrl = parser.get("postgresUrl"); + // final String dbUser = parser.get("postgresUser"); + // final String dbPassword = parser.get("postgresPassword"); + final String isLookupUrl = parser.get("isLookupUrl"); - final Map code2name = loadVocsFromDB(dbUrl, dbUser, dbPassword); - - code2name.putAll(loadVocsFromIS(isLookupUrl)); + final VocabularyGroup vocs = loadVocsFromIS(isLookupUrl); // MAP: vocId -> voc final SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { removeOutputDir(spark, targetPath); - generateEntities(spark, code2name, sourcePaths, targetPath); + generateEntities(spark, vocs, sourcePaths, targetPath); }); } private static void generateEntities( final SparkSession spark, - final Map code2name, + final VocabularyGroup vocs, final String sourcePaths, final String targetPath) { @@ -110,7 +106,7 @@ public class GenerateEntitiesApplication { sc .sequenceFile(sp, Text.class, Text.class) .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .map(k -> convertToListOaf(k._1(), k._2(), vocs)) .filter(Objects::nonNull) .flatMap(list -> list.iterator())); } @@ -140,14 +136,14 @@ public class GenerateEntitiesApplication { private static List convertToListOaf( final String id, final String s, - final Map code2name) { + final VocabularyGroup vocs) { final String type = StringUtils.substringAfter(id, ":"); switch (type.toLowerCase()) { case "native_oaf": - return new OafToOafMapper(code2name).processMdRecord(s); + return new OafToOafMapper(vocs).processMdRecord(s); case "native_odf": - return new OdfToOafMapper(code2name).processMdRecord(s); + return new OdfToOafMapper(vocs).processMdRecord(s); case "datasource": return Arrays.asList(convertFromJson(s, Datasource.class)); case "organization": @@ -169,32 +165,7 @@ public class GenerateEntitiesApplication { } } - private static Map loadVocsFromDB( - final String dbUrl, - final String dbUser, - final String dbPassword) throws IOException { - - log.info("Loading vocabulary terms from db..."); - - final Map map = new HashMap<>(); - - try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { - dbClient - .processResults("select code, name from class", rs -> { - try { - map.put(rs.getString("code"), rs.getString("name")); - } catch (final SQLException e) { - e.printStackTrace(); - } - }); - } - - log.info("Found " + map.size() + " terms."); - - return map; - } - - private static Map loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException { + private static VocabularyGroup loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); final String xquery = IOUtils @@ -202,16 +173,25 @@ public class GenerateEntitiesApplication { GenerateEntitiesApplication.class .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); - final Map map = new HashMap<>(); + final VocabularyGroup vocs = new VocabularyGroup(); for (final String s : isLookUpService.quickSearchProfile(xquery)) { final String[] arr = s.split("@=@"); if (arr.length == 4) { - map.put(arr[2].trim(), arr[3].trim()); + final String vocId = arr[0].trim(); + final String vocName = arr[1].trim(); + final String termId = arr[2].trim(); + final String termName = arr[3].trim(); + + if (!vocs.vocabularyExists(vocId)) { + vocs.addVocabulary(vocId, vocName); + } + + vocs.addTerm(vocId, termId, termName); } } - return map; + return vocs; } private static Oaf convertFromJson(final String s, final Class clazz) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 3b8f1170f7..50f5b73ac8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE; import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO; import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET; @@ -13,7 +14,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT; import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -24,6 +24,7 @@ import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; +import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; @@ -36,8 +37,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class OafToOafMapper extends AbstractMdRecordToOafMapper { - public OafToOafMapper(final Map code2name) { - super(code2name); + public OafToOafMapper(final VocabularyGroup vocs) { + super(vocs); } @Override @@ -83,7 +84,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareLanguages(final Document doc) { - return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES, DNET_LANGUAGES); + return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES); } @Override @@ -130,14 +131,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype( - prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance - .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); + .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES)); instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance @@ -300,7 +300,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List prepareResultPids(final Document doc, final DataInfo info) { - return prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info); + return prepareListStructPropsWithValidQualifier( + doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index f8d9b3cd92..2026927c17 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -8,6 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE; import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS; import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF; @@ -21,7 +22,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Set; import org.apache.commons.lang3.StringUtils; @@ -29,6 +29,7 @@ import org.dom4j.Document; import org.dom4j.Node; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; +import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; @@ -43,8 +44,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/"; - public OdfToOafMapper(final Map code2name) { - super(code2name); + public OdfToOafMapper(final VocabularyGroup vocs) { + super(vocs); } @Override @@ -120,14 +121,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype( - prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance - .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); + .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES)); instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); @@ -211,7 +211,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareLanguages(final Document doc) { - return prepareQualifier(doc, "//datacite:language", DNET_LANGUAGES, DNET_LANGUAGES); + return prepareQualifier(doc, "//datacite:language", DNET_LANGUAGES); } @Override @@ -239,7 +239,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); + return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages"); } @Override @@ -366,8 +366,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { return prepareQualifier( - doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, - DNET_DATA_CITE_RESOURCE); + doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE); } @Override @@ -375,18 +374,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final List res = new ArrayList<>(); res .addAll( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + prepareListStructPropsWithValidQualifier( + doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)); res .addAll( - prepareListStructProps( - doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types", - "dnet:pid_types", info)); + prepareListStructPropsWithValidQualifier( + doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", DNET_PID_TYPES, info)); res .addAll( - prepareListStructProps( + prepareListStructPropsWithValidQualifier( doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", - "@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info)); + "@alternateIdentifierType", DNET_PID_TYPES, info)); return res; } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java new file mode 100644 index 0000000000..7714f6d90c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.oa.graph.raw.common; + +import java.util.HashMap; +import java.util.Map; + +public class Vocabulary { + + private final String id; + private final String name; + + private final Map terms = new HashMap<>(); + + public Vocabulary(final String id, final String name) { + this.id = id; + this.name = name; + } + + public String getId() { + return id; + } + + public String getName() { + return name; + } + + protected Map getTerms() { + return terms; + } + + public VocabularyTerm getTerm(final String id) { + return terms.get(id.toLowerCase()); + } + + protected void addTerm(final String id, final String name) { + terms.put(id.toLowerCase(), new VocabularyTerm(id, name)); + } + + protected boolean termExists(final String id) { + return terms.containsKey(id.toLowerCase()); + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java new file mode 100644 index 0000000000..127f73e220 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java @@ -0,0 +1,49 @@ + +package eu.dnetlib.dhp.oa.graph.raw.common; + +import java.util.HashMap; +import java.util.Map; + +import eu.dnetlib.dhp.schema.oaf.Qualifier; + +public class VocabularyGroup { + + private final Map vocs = new HashMap<>(); + + public void addVocabulary(final String id, final String name) { + vocs.put(id.toLowerCase(), new Vocabulary(id, name)); + } + + public void addTerm(final String vocId, final String id, final String name) { + if (vocabularyExists(vocId)) { + vocs.get(vocId.toLowerCase()).addTerm(id, name); + } + } + + public VocabularyTerm getTerm(final String vocId, final String id) { + if (termExists(vocId, id)) { + return vocs.get(vocId.toLowerCase()).getTerm(id); + } else { + return new VocabularyTerm(id, id); + } + } + + public Qualifier getTermAsQualifier(final String vocId, final String id) { + if (termExists(vocId, id)) { + final Vocabulary v = vocs.get(vocId.toLowerCase()); + final VocabularyTerm t = v.getTerm(id); + return OafMapperUtils.qualifier(t.getId(), t.getName(), v.getId(), v.getName()); + } else { + return OafMapperUtils.qualifier(id, id, vocId, vocId); + } + } + + public boolean termExists(final String vocId, final String id) { + return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id); + } + + public boolean vocabularyExists(final String vocId) { + return vocs.containsKey(vocId.toLowerCase()); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java new file mode 100644 index 0000000000..b3c7859231 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java @@ -0,0 +1,22 @@ + +package eu.dnetlib.dhp.oa.graph.raw.common; + +public class VocabularyTerm { + + private final String id; + private final String name; + + public VocabularyTerm(final String id, final String name) { + this.id = id; + this.name = name; + } + + public String getId() { + return id; + } + + public String getName() { + return name; + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json index d4d0935a0e..9e3992bcfb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json @@ -17,27 +17,9 @@ "paramDescription": "the path of the target file", "paramRequired": true }, - { - "paramName": "pgurl", - "paramLongName": "postgresUrl", - "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", - "paramRequired": true - }, - { - "paramName": "pguser", - "paramLongName": "postgresUser", - "paramDescription": "postgres user", - "paramRequired": false - }, - { - "paramName": "pgpasswd", - "paramLongName": "postgresPassword", - "paramDescription": "postgres password", - "paramRequired": false - }, { "paramName": "islookup", - "paramLongName": "isLookupUrl", + "paramLongName": "islookup", "paramDescription": "the url of the ISLookupService", "paramRequired": true } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index fa015499c9..f8426c35f2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -34,6 +34,10 @@ mongoDb mongo database + + isLookupUrl + the address of the lookUp service + sparkDriverMemory @@ -233,9 +237,7 @@ --sourcePaths${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims --targetPath${workingDir}/entities_claim - --postgresUrl${postgresURL} - --postgresUser${postgresUser} - --postgresPassword${postgresPassword} + --islookup${isLookupUrl} @@ -282,9 +284,7 @@ --sourcePaths${contentPath}/db_records,${contentPath}/oaf_records,${contentPath}/odf_records --targetPath${workingDir}/entities - --postgresUrl${postgresURL} - --postgresUser${postgresUser} - --postgresPassword${postgresPassword} + --islookup${isLookupUrl} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml index cd0a4025e8..f6485ea9c3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml @@ -9,17 +9,10 @@ the temporary path to store entities before dispatching - postgresURL - the postgres URL to access to the database - - - postgresUser - the user postgres - - - postgresPassword - the password postgres + isLookupUrl + the address of the lookUp service + sparkDriverMemory memory for driver process @@ -62,9 +55,7 @@ -mt yarn-cluster -s${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records -t${migrationPathStep2}/all_entities - -pgurl${postgresURL} - -pguser${postgresUser} - -pgpasswd${postgresPassword} + --islookup${isLookupUrl} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index b9da9fb297..dad427ce4b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -9,7 +9,6 @@ import static org.mockito.Mockito.when; import java.io.IOException; import java.util.List; -import java.util.Map; import java.util.Optional; import org.apache.commons.io.IOUtils; @@ -20,6 +19,8 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils; +import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Dataset; @@ -34,18 +35,27 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class MappersTest { @Mock - private Map code2name; + private VocabularyGroup vocs; @BeforeEach public void setUp() throws Exception { - when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0)); + when(vocs.getTermAsQualifier(anyString(), anyString())) + .thenAnswer( + invocation -> OafMapperUtils + .qualifier( + invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0), + invocation.getArgument(0))); + + when(vocs.termExists(anyString(), anyString())).thenReturn(true); + } @Test void testPublication() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); - final List list = new OafToOafMapper(code2name).processMdRecord(xml); + final List list = new OafToOafMapper(vocs).processMdRecord(xml); assertEquals(3, list.size()); assertTrue(list.get(0) instanceof Publication); @@ -86,6 +96,10 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); + assertTrue(p.getPid().size() > 0); + assertEquals(p.getPid().get(0).getValue(), "10.3897/oneeco.2.e13718"); + assertEquals(p.getPid().get(0).getQualifier().getClassid(), "doi"); + assertNotNull(p.getInstance()); assertTrue(p.getInstance().size() > 0); p @@ -115,6 +129,7 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(r1.getRelType())); assertTrue(StringUtils.isNotBlank(r2.getRelType())); + // System.out.println(new ObjectMapper().writeValueAsString(p)); // System.out.println(new ObjectMapper().writeValueAsString(r1)); // System.out.println(new ObjectMapper().writeValueAsString(r2)); } @@ -123,7 +138,7 @@ public class MappersTest { void testDataset() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); - final List list = new OdfToOafMapper(code2name).processMdRecord(xml); + final List list = new OdfToOafMapper(vocs).processMdRecord(xml); assertEquals(3, list.size()); assertTrue(list.get(0) instanceof Dataset); @@ -205,7 +220,7 @@ public class MappersTest { void testSoftware() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); - final List list = new OdfToOafMapper(code2name).processMdRecord(xml); + final List list = new OdfToOafMapper(vocs).processMdRecord(xml); assertEquals(1, list.size()); assertTrue(list.get(0) instanceof Software); From 7a7272d9ec0cfe6cbf95b9bf4797b2819db9535b Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 26 May 2020 13:06:55 +0200 Subject: [PATCH 5/7] result pids (new xpaths + IS vocabularies) --- .../raw/AbstractMdRecordToOafMapper.java | 167 ++++++++---------- .../raw/GenerateEntitiesApplication.java | 161 ++++++++++------- .../dhp/oa/graph/raw/OafToOafMapper.java | 33 ++-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 54 +++--- .../graph/generate_entities_parameters.json | 7 +- 5 files changed, 217 insertions(+), 205 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 5c89d5096f..278544e0b0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -10,10 +10,27 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY; +import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES; +import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS; +import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; +import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE; +import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; @@ -47,10 +64,8 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; - protected static final Qualifier ORCID_PID_TYPE = qualifier( - "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); - protected static final Qualifier MAG_PID_TYPE = qualifier( - "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier ORCID_PID_TYPE = qualifier("ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier MAG_PID_TYPE = qualifier("MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); protected static final Map nsContext = new HashMap<>(); @@ -64,8 +79,7 @@ public abstract class AbstractMdRecordToOafMapper { nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); } - protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( - "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -79,20 +93,15 @@ public abstract class AbstractMdRecordToOafMapper { .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + final KeyValue collectedFrom = getProvenanceDatasource(doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); - if (collectedFrom == null) { - return null; - } + if (collectedFrom == null) { return null; } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); - if (hostedBy == null) { - return null; - } + if (hostedBy == null) { return null; } final DataInfo info = prepareDataInfo(doc); final long lastUpdateTimestamp = new Date().getTime(); @@ -107,9 +116,7 @@ public abstract class AbstractMdRecordToOafMapper { final String dsId = doc.valueOf(xpathId); final String dsName = doc.valueOf(xpathName); - if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { - return null; - } + if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { return null; } return keyValue(createOpenaireId(10, dsId, true), dsName); } @@ -125,47 +132,47 @@ public abstract class AbstractMdRecordToOafMapper { final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(DATASET_DEFAULT_RESULTTYPE); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); - break; - case "": - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(ORP_DEFAULT_RESULTTYPE); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + d.setResulttype(DATASET_DEFAULT_RESULTTYPE); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "": + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(ORP_DEFAULT_RESULTTYPE); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; } if (!oafs.isEmpty()) { @@ -194,15 +201,9 @@ public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add( - getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, lastUpdateTimestamp)); } } @@ -247,10 +248,7 @@ public abstract class AbstractMdRecordToOafMapper { r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); r.setCollectedfrom(Arrays.asList(collectedFrom)); - r - .setPid( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + r.setPid(prepareResultPids(doc, info)); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES @@ -278,6 +276,8 @@ public abstract class AbstractMdRecordToOafMapper { r.setBestaccessright(getBestAccessRights(instances)); } + protected abstract List prepareResultPids(Document doc, DataInfo info); + private List prepareContexts(final Document doc, final DataInfo info) { final List list = new ArrayList<>(); for (final Object o : doc.selectNodes("//oaf:concept")) { @@ -358,7 +358,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - protected static Qualifier getBestAccessRights(List instanceList) { + protected static Qualifier getBestAccessRights(final List instanceList) { if (instanceList != null) { final Optional min = instanceList .stream() @@ -398,9 +398,7 @@ public abstract class AbstractMdRecordToOafMapper { final String sp = n.valueOf("@sp"); final String vol = n.valueOf("@vol"); final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { - return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); - } + if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } } return null; } @@ -453,10 +451,7 @@ public abstract class AbstractMdRecordToOafMapper { for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add( - structuredProperty( - n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), - n.valueOf("@schemename"), info)); + .add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n.valueOf("@schemename"), info)); } return res; } @@ -464,9 +459,7 @@ public abstract class AbstractMdRecordToOafMapper { protected OAIProvenance prepareOAIprovenance(final Document doc) { final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - if (n == null) { - return null; - } + if (n == null) { return null; } final String identifier = n.valueOf("./*[local-name()='identifier']"); final String baseURL = n.valueOf("./*[local-name()='baseURL']"); @@ -481,9 +474,7 @@ public abstract class AbstractMdRecordToOafMapper { protected DataInfo prepareDataInfo(final Document doc) { final Node n = doc.selectSingleNode("//oaf:datainfo"); - if (n == null) { - return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); - } + if (n == null) { return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); @@ -495,9 +486,7 @@ public abstract class AbstractMdRecordToOafMapper { final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); final String trust = n.valueOf("./oaf:trust"); - return dataInfo( - deletedbyinference, inferenceprovenance, inferred, false, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 739c7a4629..d184364174 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -5,7 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; import java.sql.SQLException; -import java.util.*; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; @@ -27,7 +32,19 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import scala.Tuple2; public class GenerateEntitiesApplication { @@ -39,14 +56,12 @@ public class GenerateEntitiesApplication { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString( - MigrateMongoMdstoresApplication.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); + .toString(GenerateEntitiesApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional + final Boolean isSparkSessionManaged = Optional .ofNullable(parser.get("isSparkSessionManaged")) .map(Boolean::valueOf) .orElse(Boolean.TRUE); @@ -58,17 +73,17 @@ public class GenerateEntitiesApplication { final String dbUrl = parser.get("postgresUrl"); final String dbUser = parser.get("postgresUser"); final String dbPassword = parser.get("postgresPassword"); + final String isLookupUrl = parser.get("isLookupUrl"); - final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); + final Map code2name = loadVocsFromDB(dbUrl, dbUser, dbPassword); - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, targetPath); - generateEntities(spark, code2name, sourcePaths, targetPath); - }); + code2name.putAll(loadVocsFromIS(isLookupUrl)); + + final SparkConf conf = new SparkConf(); + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, targetPath); + generateEntities(spark, code2name, sourcePaths, targetPath); + }); } private static void generateEntities( @@ -77,7 +92,7 @@ public class GenerateEntitiesApplication { final String sourcePaths, final String targetPath) { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final List existingSourcePaths = Arrays .stream(sourcePaths.split(",")) .filter(p -> exists(sc, p)) @@ -90,27 +105,25 @@ public class GenerateEntitiesApplication { for (final String sp : existingSourcePaths) { inputRdd = inputRdd - .union( - sc - .sequenceFile(sp, Text.class, Text.class) - .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> convertToListOaf(k._1(), k._2(), code2name)) - .filter(Objects::nonNull) - .flatMap(list -> list.iterator())); + .union(sc + .sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .filter(Objects::nonNull) + .flatMap(list -> list.iterator())); } inputRdd .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) .reduceByKey((o1, o2) -> merge(o1, o2)) .map(Tuple2::_2) - .map( - oaf -> oaf.getClass().getSimpleName().toLowerCase() - + "|" - + OBJECT_MAPPER.writeValueAsString(oaf)) + .map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + + "|" + + OBJECT_MAPPER.writeValueAsString(oaf)) .saveAsTextFile(targetPath, GzipCodec.class); } - private static Oaf merge(Oaf o1, Oaf o2) { + private static Oaf merge(final Oaf o1, final Oaf o2) { if (ModelSupport.isSubClass(o1, OafEntity.class)) { ((OafEntity) o1).mergeFrom((OafEntity) o2); } else if (ModelSupport.isSubClass(o1, Relation.class)) { @@ -122,37 +135,41 @@ public class GenerateEntitiesApplication { } private static List convertToListOaf( - final String id, final String s, final Map code2name) { + final String id, + final String s, + final Map code2name) { final String type = StringUtils.substringAfter(id, ":"); switch (type.toLowerCase()) { - case "native_oaf": - return new OafToOafMapper(code2name).processMdRecord(s); - case "native_odf": - return new OdfToOafMapper(code2name).processMdRecord(s); - case "datasource": - return Arrays.asList(convertFromJson(s, Datasource.class)); - case "organization": - return Arrays.asList(convertFromJson(s, Organization.class)); - case "project": - return Arrays.asList(convertFromJson(s, Project.class)); - case "relation": - return Arrays.asList(convertFromJson(s, Relation.class)); - case "publication": - return Arrays.asList(convertFromJson(s, Publication.class)); - case "dataset": - return Arrays.asList(convertFromJson(s, Dataset.class)); - case "software": - return Arrays.asList(convertFromJson(s, Software.class)); - case "otherresearchproduct": - return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); - default: - throw new RuntimeException("type not managed: " + type.toLowerCase()); + case "native_oaf": + return new OafToOafMapper(code2name).processMdRecord(s); + case "native_odf": + return new OdfToOafMapper(code2name).processMdRecord(s); + case "datasource": + return Arrays.asList(convertFromJson(s, Datasource.class)); + case "organization": + return Arrays.asList(convertFromJson(s, Organization.class)); + case "project": + return Arrays.asList(convertFromJson(s, Project.class)); + case "relation": + return Arrays.asList(convertFromJson(s, Relation.class)); + case "publication": + return Arrays.asList(convertFromJson(s, Publication.class)); + case "dataset": + return Arrays.asList(convertFromJson(s, Dataset.class)); + case "software": + return Arrays.asList(convertFromJson(s, Software.class)); + case "otherresearchproduct": + return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); + default: + throw new RuntimeException("type not managed: " + type.toLowerCase()); } } - private static Map loadClassNames( - final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + private static Map loadVocsFromDB( + final String dbUrl, + final String dbUser, + final String dbPassword) throws IOException { log.info("Loading vocabulary terms from db..."); @@ -160,15 +177,13 @@ public class GenerateEntitiesApplication { try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { dbClient - .processResults( - "select code, name from class", - rs -> { - try { - map.put(rs.getString("code"), rs.getString("name")); - } catch (final SQLException e) { - e.printStackTrace(); - } - }); + .processResults("select code, name from class", rs -> { + try { + map.put(rs.getString("code"), rs.getString("name")); + } catch (final SQLException e) { + e.printStackTrace(); + } + }); } log.info("Found " + map.size() + " terms."); @@ -176,6 +191,24 @@ public class GenerateEntitiesApplication { return map; } + private static Map loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); + + final String xquery = + IOUtils.toString(GenerateEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); + + final Map map = new HashMap<>(); + + for (final String s : isLookUpService.quickSearchProfile(xquery)) { + final String[] arr = s.split("@=@"); + if (arr.length == 4) { + map.put(arr[2].trim(), arr[3].trim()); + } + } + + return map; + } + private static Oaf convertFromJson(final String s, final Class clazz) { try { return OBJECT_MAPPER.readValue(s, clazz); @@ -196,7 +229,7 @@ public class GenerateEntitiesApplication { } } - private static void removeOutputDir(SparkSession spark, String path) { + private static void removeOutputDir(final SparkSession spark, final String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 53c0913c23..a0984eed42 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -130,8 +130,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype( - prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -147,14 +146,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); instance - .setUrl( - nodes - .stream() - .filter(n -> StringUtils.isNotBlank(n.getText())) - .map(n -> n.getText().trim()) - .filter(u -> u.startsWith("http")) - .distinct() - .collect(Collectors.toCollection(ArrayList::new))); + .setUrl(nodes + .stream() + .filter(n -> StringUtils.isNotBlank(n.getText())) + .map(n -> n.getText().trim()) + .filter(u -> u.startsWith("http")) + .distinct() + .collect(Collectors.toCollection(ArrayList::new))); return Lists.newArrayList(instance); } @@ -279,15 +277,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final String otherId = createOpenaireId(50, originalId, false); res - .add( - getRelation( - docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); } } return res; @@ -297,4 +289,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { return null; // NOT PRESENT IN OAF } + + @Override + protected List prepareResultPids(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index b9a1599174..64af8a7512 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -120,8 +120,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype( - prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -170,10 +169,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { res - .add( - structuredProperty( - ((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, - info)); + .add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, info)); } } return res; @@ -225,16 +221,14 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareOtherResearchProductContactGroups( final Document doc, final DataInfo info) { - return prepareListFields( - doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons( final Document doc, final DataInfo info) { - return prepareListFields( - doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); } @Override @@ -260,8 +254,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareSoftwareDocumentationUrls( final Document doc, final DataInfo info) { - return prepareListFields( - doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); + return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } // DATASETS @@ -335,29 +328,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { if (type.equalsIgnoreCase("IsSupplementTo")) { res - .add( - getRelation( - docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, lastUpdateTimestamp)); } else if (type.equals("IsPartOf")) { res - .add( - getRelation( - docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, - lastUpdateTimestamp)); + .add(getRelation(docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, lastUpdateTimestamp)); res - .add( - getRelation( - otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, - lastUpdateTimestamp)); - } else { - } + .add(getRelation(otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, lastUpdateTimestamp)); + } else {} } } return res; @@ -365,8 +345,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return prepareQualifier( - doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, - DNET_DATA_CITE_RESOURCE); + return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, DNET_DATA_CITE_RESOURCE); } + + @Override + protected List prepareResultPids(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + res.addAll(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + res.addAll(prepareListStructProps(doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + res.addAll(prepareListStructProps(doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", "@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info)); + return res; + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json index 293bf041b9..d4d0935a0e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json @@ -34,6 +34,11 @@ "paramLongName": "postgresPassword", "paramDescription": "postgres password", "paramRequired": false + }, + { + "paramName": "islookup", + "paramLongName": "isLookupUrl", + "paramDescription": "the url of the ISLookupService", + "paramRequired": true } - ] \ No newline at end of file From a25598140a978fb781f570aaa84db5b957a0caa9 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 26 May 2020 13:11:09 +0200 Subject: [PATCH 6/7] result pids (new xpaths + IS vocabularies) --- .../raw/AbstractMdRecordToOafMapper.java | 137 +++++++++++------- .../raw/GenerateEntitiesApplication.java | 75 +++++----- .../dhp/oa/graph/raw/OafToOafMapper.java | 31 ++-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 61 ++++++-- 4 files changed, 189 insertions(+), 115 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 278544e0b0..99334dd7f9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -64,8 +64,10 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; - protected static final Qualifier ORCID_PID_TYPE = qualifier("ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); - protected static final Qualifier MAG_PID_TYPE = qualifier("MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier ORCID_PID_TYPE = qualifier( + "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + protected static final Qualifier MAG_PID_TYPE = qualifier( + "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); protected static final Map nsContext = new HashMap<>(); @@ -79,7 +81,8 @@ public abstract class AbstractMdRecordToOafMapper { nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); } - protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( + "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected AbstractMdRecordToOafMapper(final Map code2name) { this.code2name = code2name; @@ -93,15 +96,20 @@ public abstract class AbstractMdRecordToOafMapper { .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = getProvenanceDatasource(doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + final KeyValue collectedFrom = getProvenanceDatasource( + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); - if (collectedFrom == null) { return null; } + if (collectedFrom == null) { + return null; + } final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); - if (hostedBy == null) { return null; } + if (hostedBy == null) { + return null; + } final DataInfo info = prepareDataInfo(doc); final long lastUpdateTimestamp = new Date().getTime(); @@ -116,7 +124,9 @@ public abstract class AbstractMdRecordToOafMapper { final String dsId = doc.valueOf(xpathId); final String dsName = doc.valueOf(xpathName); - if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { return null; } + if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { + return null; + } return keyValue(createOpenaireId(10, dsId, true), dsName); } @@ -132,47 +142,47 @@ public abstract class AbstractMdRecordToOafMapper { final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(DATASET_DEFAULT_RESULTTYPE); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); - break; - case "": - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(ORP_DEFAULT_RESULTTYPE); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + d.setResulttype(DATASET_DEFAULT_RESULTTYPE); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "": + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(ORP_DEFAULT_RESULTTYPE); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; } if (!oafs.isEmpty()) { @@ -201,9 +211,15 @@ public abstract class AbstractMdRecordToOafMapper { final String projectId = createOpenaireId(40, originalId, true); res - .add(getRelation(docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, + lastUpdateTimestamp)); } } @@ -398,7 +414,9 @@ public abstract class AbstractMdRecordToOafMapper { final String sp = n.valueOf("@sp"); final String vol = n.valueOf("@vol"); final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } + if (StringUtils.isNotBlank(name)) { + return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); + } } return null; } @@ -451,7 +469,10 @@ public abstract class AbstractMdRecordToOafMapper { for (final Object o : node.selectNodes(xpath)) { final Node n = (Node) o; res - .add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n.valueOf("@schemename"), info)); + .add( + structuredProperty( + n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), + n.valueOf("@schemename"), info)); } return res; } @@ -459,7 +480,9 @@ public abstract class AbstractMdRecordToOafMapper { protected OAIProvenance prepareOAIprovenance(final Document doc) { final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - if (n == null) { return null; } + if (n == null) { + return null; + } final String identifier = n.valueOf("./*[local-name()='identifier']"); final String baseURL = n.valueOf("./*[local-name()='baseURL']"); @@ -474,7 +497,9 @@ public abstract class AbstractMdRecordToOafMapper { protected DataInfo prepareDataInfo(final Document doc) { final Node n = doc.selectSingleNode("//oaf:datainfo"); - if (n == null) { return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } + if (n == null) { + return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + } final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); @@ -486,7 +511,9 @@ public abstract class AbstractMdRecordToOafMapper { final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); final String trust = n.valueOf("./oaf:trust"); - return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + return dataInfo( + deletedbyinference, inferenceprovenance, inferred, false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); } protected Field prepareField(final Node node, final String xpath, final DataInfo info) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index d184364174..0d140e9adc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -56,8 +56,9 @@ public class GenerateEntitiesApplication { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(GenerateEntitiesApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); + .toString( + GenerateEntitiesApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); parser.parseArgument(args); @@ -105,21 +106,23 @@ public class GenerateEntitiesApplication { for (final String sp : existingSourcePaths) { inputRdd = inputRdd - .union(sc - .sequenceFile(sp, Text.class, Text.class) - .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> convertToListOaf(k._1(), k._2(), code2name)) - .filter(Objects::nonNull) - .flatMap(list -> list.iterator())); + .union( + sc + .sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .filter(Objects::nonNull) + .flatMap(list -> list.iterator())); } inputRdd .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) .reduceByKey((o1, o2) -> merge(o1, o2)) .map(Tuple2::_2) - .map(oaf -> oaf.getClass().getSimpleName().toLowerCase() - + "|" - + OBJECT_MAPPER.writeValueAsString(oaf)) + .map( + oaf -> oaf.getClass().getSimpleName().toLowerCase() + + "|" + + OBJECT_MAPPER.writeValueAsString(oaf)) .saveAsTextFile(targetPath, GzipCodec.class); } @@ -141,28 +144,28 @@ public class GenerateEntitiesApplication { final String type = StringUtils.substringAfter(id, ":"); switch (type.toLowerCase()) { - case "native_oaf": - return new OafToOafMapper(code2name).processMdRecord(s); - case "native_odf": - return new OdfToOafMapper(code2name).processMdRecord(s); - case "datasource": - return Arrays.asList(convertFromJson(s, Datasource.class)); - case "organization": - return Arrays.asList(convertFromJson(s, Organization.class)); - case "project": - return Arrays.asList(convertFromJson(s, Project.class)); - case "relation": - return Arrays.asList(convertFromJson(s, Relation.class)); - case "publication": - return Arrays.asList(convertFromJson(s, Publication.class)); - case "dataset": - return Arrays.asList(convertFromJson(s, Dataset.class)); - case "software": - return Arrays.asList(convertFromJson(s, Software.class)); - case "otherresearchproduct": - return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); - default: - throw new RuntimeException("type not managed: " + type.toLowerCase()); + case "native_oaf": + return new OafToOafMapper(code2name).processMdRecord(s); + case "native_odf": + return new OdfToOafMapper(code2name).processMdRecord(s); + case "datasource": + return Arrays.asList(convertFromJson(s, Datasource.class)); + case "organization": + return Arrays.asList(convertFromJson(s, Organization.class)); + case "project": + return Arrays.asList(convertFromJson(s, Project.class)); + case "relation": + return Arrays.asList(convertFromJson(s, Relation.class)); + case "publication": + return Arrays.asList(convertFromJson(s, Publication.class)); + case "dataset": + return Arrays.asList(convertFromJson(s, Dataset.class)); + case "software": + return Arrays.asList(convertFromJson(s, Software.class)); + case "otherresearchproduct": + return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); + default: + throw new RuntimeException("type not managed: " + type.toLowerCase()); } } @@ -194,8 +197,10 @@ public class GenerateEntitiesApplication { private static Map loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); - final String xquery = - IOUtils.toString(GenerateEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); + final String xquery = IOUtils + .toString( + GenerateEntitiesApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); final Map map = new HashMap<>(); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index a0984eed42..7231332074 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -130,7 +130,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype( + prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -146,13 +147,14 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); instance - .setUrl(nodes - .stream() - .filter(n -> StringUtils.isNotBlank(n.getText())) - .map(n -> n.getText().trim()) - .filter(u -> u.startsWith("http")) - .distinct() - .collect(Collectors.toCollection(ArrayList::new))); + .setUrl( + nodes + .stream() + .filter(n -> StringUtils.isNotBlank(n.getText())) + .map(n -> n.getText().trim()) + .filter(u -> u.startsWith("http")) + .distinct() + .collect(Collectors.toCollection(ArrayList::new))); return Lists.newArrayList(instance); } @@ -277,9 +279,15 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final String otherId = createOpenaireId(50, originalId, false); res - .add(getRelation(docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, + lastUpdateTimestamp)); } } return res; @@ -292,6 +300,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List prepareResultPids(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info); + return prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 64af8a7512..bd60a658f7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -120,7 +120,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Instance instance = new Instance(); instance - .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); + .setInstancetype( + prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); @@ -169,7 +170,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { res - .add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, info)); + .add( + structuredProperty( + ((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, + info)); } } return res; @@ -221,14 +225,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareOtherResearchProductContactGroups( final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); + return prepareListFields( + doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); } @Override protected List> prepareOtherResearchProductContactPersons( final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); + return prepareListFields( + doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); } @Override @@ -254,7 +260,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { protected List> prepareSoftwareDocumentationUrls( final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); + return prepareListFields( + doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } // DATASETS @@ -328,16 +335,29 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { if (type.equalsIgnoreCase("IsSupplementTo")) { res - .add(getRelation(docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, + lastUpdateTimestamp)); } else if (type.equals("IsPartOf")) { res - .add(getRelation(docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, lastUpdateTimestamp)); + .add( + getRelation( + docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, + lastUpdateTimestamp)); res - .add(getRelation(otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, lastUpdateTimestamp)); - } else {} + .add( + getRelation( + otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, + lastUpdateTimestamp)); + } else { + } } } return res; @@ -345,15 +365,28 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, DNET_DATA_CITE_RESOURCE); + return prepareQualifier( + doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, + DNET_DATA_CITE_RESOURCE); } @Override protected List prepareResultPids(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); - res.addAll(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); - res.addAll(prepareListStructProps(doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); - res.addAll(prepareListStructProps(doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", "@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info)); + res + .addAll( + prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + res + .addAll( + prepareListStructProps( + doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types", + "dnet:pid_types", info)); + res + .addAll( + prepareListStructProps( + doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", + "@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info)); return res; } From b81f2741d24971175f5a2be5f124b54b1bcc5958 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 26 May 2020 13:13:17 +0200 Subject: [PATCH 7/7] xquery --- .../eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery new file mode 100644 index 0000000000..4938c0aa42 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery @@ -0,0 +1,5 @@ +for $x in collection(' /db/DRIVER/VocabularyDSResources/VocabularyDSResourceType') + let $vocid := $x//VOCABULARY_NAME/@code + let $vocname := $x//VOCABULARY_NAME/text() + for $term in ($x//TERM) + return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name)