diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
index 5c89d5096f..ab1e89187a 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
@@ -10,10 +10,27 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
 
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
 
 import org.apache.commons.lang3.StringUtils;
 import org.dom4j.Document;
@@ -21,6 +38,7 @@ import org.dom4j.DocumentFactory;
 import org.dom4j.DocumentHelper;
 import org.dom4j.Node;
 
+import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.LicenseComparator;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Context;
@@ -43,7 +61,7 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 
 public abstract class AbstractMdRecordToOafMapper {
 
-    protected final Map<String, String> code2name;
+    protected final VocabularyGroup vocs;
 
     protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
     protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
@@ -67,8 +85,8 @@ public abstract class AbstractMdRecordToOafMapper {
     protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
         "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
 
-    protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
-        this.code2name = code2name;
+    protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs) {
+        this.vocs = vocs;
     }
 
     public List<Oaf> processMdRecord(final String xml) {
@@ -247,10 +265,7 @@ public abstract class AbstractMdRecordToOafMapper {
         r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
         r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
         r.setCollectedfrom(Arrays.asList(collectedFrom));
-        r
-            .setPid(
-                prepareListStructProps(
-                    doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
+        r.setPid(prepareResultPids(doc, info));
         r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
         r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
         r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
@@ -278,6 +293,8 @@ public abstract class AbstractMdRecordToOafMapper {
         r.setBestaccessright(getBestAccessRights(instances));
     }
 
+    protected abstract List<StructuredProperty> prepareResultPids(Document doc, DataInfo info);
+
     private List<Context> prepareContexts(final Document doc, final DataInfo info) {
         final List<Context> list = new ArrayList<>();
         for (final Object o : doc.selectNodes("//oaf:concept")) {
@@ -358,7 +375,7 @@
     protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
 
-    protected static Qualifier getBestAccessRights(List<Instance> instanceList) {
+    protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
         if (instanceList != null) {
             final Optional<Qualifier> min = instanceList
                 .stream()
@@ -405,14 +422,12 @@
         return null;
     }
 
-    protected Qualifier prepareQualifier(
-        final Node node,
-        final String xpath,
-        final String schemeId,
-        final String schemeName) {
-        final String classId = node.valueOf(xpath);
-        final String className = code2name.get(classId);
-        return qualifier(classId, className, schemeId, schemeName);
+    protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) {
+        return prepareQualifier(node.valueOf(xpath).trim(), schemeId);
+    }
+
+    protected Qualifier prepareQualifier(final String classId, final String schemeId) {
+        return vocs.getTermAsQualifier(schemeId, classId);
     }
 
     protected List<StructuredProperty> prepareListStructProps(
@@ -420,14 +435,31 @@
         final String xpath,
         final String xpathClassId,
         final String schemeId,
-        final String schemeName,
         final DataInfo info) {
         final List<StructuredProperty> res = new ArrayList<>();
+
         for (final Object o : node.selectNodes(xpath)) {
             final Node n = (Node) o;
-            final String classId = n.valueOf(xpathClassId);
-            final String className = code2name.get(classId);
-            res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
+            final String classId = n.valueOf(xpathClassId).trim();
+            res.add(structuredProperty(n.getText(), prepareQualifier(classId, schemeId), info));
+        }
+        return res;
+    }
+
+    protected List<StructuredProperty> prepareListStructPropsWithValidQualifier(
+        final Node node,
+        final String xpath,
+        final String xpathClassId,
+        final String schemeId,
+        final DataInfo info) {
+        final List<StructuredProperty> res = new ArrayList<>();
+
+        for (final Object o : node.selectNodes(xpath)) {
+            final Node n = (Node) o;
+            final String classId = n.valueOf(xpathClassId).trim();
+            if (vocs.termExists(schemeId, classId)) {
+                res.add(structuredProperty(n.getText(), vocs.getTermAsQualifier(schemeId, classId), info));
+            }
         }
         return res;
     }
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
index 739c7a4629..3becdec448 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java
@@ -4,8 +4,10 @@ package eu.dnetlib.dhp.oa.graph.raw;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
 import java.io.IOException;
-import java.sql.SQLException;
-import java.util.*;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
 import java.util.stream.Collectors;
 
 import org.apache.commons.io.IOUtils;
@@ -24,10 +26,22 @@ import org.slf4j.LoggerFactory;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.DbClient;
 import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Project;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Software;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import scala.Tuple2;
 
 public class GenerateEntitiesApplication {
@@ -40,13 +54,12 @@ public class GenerateEntitiesApplication {
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(
             IOUtils
                 .toString(
-                    MigrateMongoMdstoresApplication.class
-                        .getResourceAsStream(
-                            "/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json")));
+                    GenerateEntitiesApplication.class
+                        .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json")));
         parser.parseArgument(args);
 
-        Boolean isSparkSessionManaged = Optional
+        final Boolean isSparkSessionManaged = Optional
             .ofNullable(parser.get("isSparkSessionManaged"))
             .map(Boolean::valueOf)
             .orElse(Boolean.TRUE);
@@ -55,29 +68,28 @@ public class GenerateEntitiesApplication {
         final String sourcePaths = parser.get("sourcePaths");
         final String targetPath = parser.get("targetPath");
 
-        final String dbUrl = parser.get("postgresUrl");
-        final String dbUser = parser.get("postgresUser");
-        final String dbPassword = parser.get("postgresPassword");
+        // final String dbUrl = parser.get("postgresUrl");
+        // final String dbUser = parser.get("postgresUser");
+        // final String dbPassword = parser.get("postgresPassword");
 
-        final Map<String, String> code2name = loadClassNames(dbUrl, dbUser, dbPassword);
+        final String isLookupUrl = parser.get("isLookupUrl");
 
-        SparkConf conf = new SparkConf();
-        runWithSparkSession(
-            conf,
-            isSparkSessionManaged,
-            spark -> {
-                removeOutputDir(spark, targetPath);
-                generateEntities(spark, code2name, sourcePaths, targetPath);
-            });
+        final VocabularyGroup vocs = loadVocsFromIS(isLookupUrl); // MAP: vocId -> voc
+
+        final SparkConf conf = new SparkConf();
+        runWithSparkSession(conf, isSparkSessionManaged, spark -> {
+            removeOutputDir(spark, targetPath);
+            generateEntities(spark, vocs, sourcePaths, targetPath);
+        });
     }
 
     private static void generateEntities(
         final SparkSession spark,
-        final Map<String, String> code2name,
+        final VocabularyGroup vocs,
         final String sourcePaths,
         final String targetPath) {
-        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
         final List<String> existingSourcePaths = Arrays
             .stream(sourcePaths.split(","))
             .filter(p -> exists(sc, p))
             .collect(Collectors.toList());
@@ -94,7 +106,7 @@ public class GenerateEntitiesApplication {
                 sc
                     .sequenceFile(sp, Text.class, Text.class)
                     .map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
-                    .map(k -> convertToListOaf(k._1(), k._2(), code2name))
+                    .map(k -> convertToListOaf(k._1(), k._2(), vocs))
                     .filter(Objects::nonNull)
                     .flatMap(list -> list.iterator()));
         }
@@ -110,7 +122,7 @@ public class GenerateEntitiesApplication {
             .saveAsTextFile(targetPath, GzipCodec.class);
     }
 
-    private static Oaf merge(Oaf o1, Oaf o2) {
+    private static Oaf merge(final Oaf o1, final Oaf o2) {
         if (ModelSupport.isSubClass(o1, OafEntity.class)) {
             ((OafEntity) o1).mergeFrom((OafEntity) o2);
         } else if (ModelSupport.isSubClass(o1, Relation.class)) {
@@ -122,14 +134,16 @@ public class GenerateEntitiesApplication {
     }
 
     private static List<Oaf> convertToListOaf(
-        final String id, final String s, final Map<String, String> code2name) {
+        final String id,
+        final String s,
+        final VocabularyGroup vocs) {
         final String type = StringUtils.substringAfter(id, ":");
 
         switch (type.toLowerCase()) {
             case "native_oaf":
-                return new OafToOafMapper(code2name).processMdRecord(s);
+                return new OafToOafMapper(vocs).processMdRecord(s);
             case "native_odf":
-                return new OdfToOafMapper(code2name).processMdRecord(s);
+                return new OdfToOafMapper(vocs).processMdRecord(s);
             case "datasource":
                 return Arrays.asList(convertFromJson(s, Datasource.class));
             case "organization":
@@ -151,29 +165,33 @@ public class GenerateEntitiesApplication {
         }
     }
 
-    private static Map<String, String> loadClassNames(
-        final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
+    private static VocabularyGroup loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException {
+        final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
 
-        log.info("Loading vocabulary terms from db...");
+        final String xquery = IOUtils
+            .toString(
+                GenerateEntitiesApplication.class
+                    .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery"));
 
-        final Map<String, String> map = new HashMap<>();
+        final VocabularyGroup vocs = new VocabularyGroup();
 
-        try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
-            dbClient
-                .processResults(
-                    "select code, name from class",
-                    rs -> {
-                        try {
-                            map.put(rs.getString("code"), rs.getString("name"));
-                        } catch (final SQLException e) {
-                            e.printStackTrace();
-                        }
-                    });
+        for (final String s : isLookUpService.quickSearchProfile(xquery)) {
+            final String[] arr = s.split("@=@");
+            if (arr.length == 4) {
+                final String vocId = arr[0].trim();
+                final String vocName = arr[1].trim();
+                final String termId = arr[2].trim();
+                final String termName = arr[3].trim();
+
+                if (!vocs.vocabularyExists(vocId)) {
+                    vocs.addVocabulary(vocId, vocName);
+                }
+
+                vocs.addTerm(vocId, termId, termName);
+            }
         }
 
-        log.info("Found " + map.size() + " terms.");
-
-        return map;
+        return vocs;
     }
 
     private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
@@ -196,7 +214,7 @@ public class GenerateEntitiesApplication {
         }
     }
 
-    private static void removeOutputDir(SparkSession spark, String path) {
+    private static void removeOutputDir(final SparkSession spark, final String path) {
         HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
     }
 }
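Note on the new vocabulary loading (illustrative, not part of the patch): `load_vocabularies.xquery` emits one string per term with four `@=@`-separated fields (vocabulary id, vocabulary name, term id, term name), and `loadVocsFromIS` splits each record accordingly. A minimal sketch of that parsing, with an invented record value and a hypothetical class name:

```java
// Sketch only: the record below is made up for illustration; real records
// come from the IS vocabulary profiles via load_vocabularies.xquery.
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;

public class VocabularyRecordParseSketch {

	public static void main(final String[] args) {
		final String record = "dnet:access_modes @=@ dnet:access_modes @=@ OPEN @=@ Open Access";

		final VocabularyGroup vocs = new VocabularyGroup();
		final String[] arr = record.split("@=@");
		if (arr.length == 4) {
			final String vocId = arr[0].trim();
			if (!vocs.vocabularyExists(vocId)) {
				vocs.addVocabulary(vocId, arr[1].trim());
			}
			vocs.addTerm(vocId, arr[2].trim(), arr[3].trim());
		}

		// The parsed term is now resolvable as a Qualifier, as done in the mappers.
		System.out.println(vocs.getTermAsQualifier("dnet:access_modes", "OPEN").getClassname()); // Open Access
	}
}
```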
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
index 53c0913c23..6f91ce733b 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java
@@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
@@ -13,7 +14,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
 
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Map;
 import java.util.stream.Collectors;
 
 import org.apache.commons.lang3.StringUtils;
@@ -24,6 +24,7 @@ import org.dom4j.Node;
 import com.google.common.collect.Lists;
 
 import eu.dnetlib.dhp.common.PacePerson;
+import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Field;
@@ -36,8 +37,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 
 public class OafToOafMapper extends AbstractMdRecordToOafMapper {
 
-    public OafToOafMapper(final Map<String, String> code2name) {
-        super(code2name);
+    public OafToOafMapper(final VocabularyGroup vocs) {
+        super(vocs);
     }
 
     @Override
@@ -83,7 +84,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
 
     @Override
     protected Qualifier prepareLanguages(final Document doc) {
-        return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES, DNET_LANGUAGES);
+        return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES);
     }
 
     @Override
@@ -130,14 +131,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
 
         final Instance instance = new Instance();
         instance
-            .setInstancetype(
-                prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
+            .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
         instance.setCollectedfrom(collectedfrom);
         instance.setHostedby(hostedby);
         instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
         instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
         instance
-            .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES));
+            .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
         instance.setLicense(field(doc.valueOf("//oaf:license"), info));
         instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
         instance
@@ -297,4 +297,10 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
     protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
         return null; // NOT PRESENT IN OAF
     }
+
+    @Override
+    protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
+        return prepareListStructPropsWithValidQualifier(
+            doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info);
+    }
 }
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
index b9a1599174..bbd9442e14 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
@@ -8,6 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
 import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
@@ -21,7 +22,6 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.lang3.StringUtils;
@@ -29,6 +29,7 @@ import org.dom4j.Document;
 import org.dom4j.Node;
 
 import eu.dnetlib.dhp.common.PacePerson;
+import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Field;
@@ -43,8 +44,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 
     public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
 
-    public OdfToOafMapper(final Map<String, String> code2name) {
-        super(code2name);
+    public OdfToOafMapper(final VocabularyGroup vocs) {
+        super(vocs);
     }
 
     @Override
@@ -120,14 +121,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 
         final Instance instance = new Instance();
         instance
-            .setInstancetype(
-                prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
+            .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
         instance.setCollectedfrom(collectedfrom);
         instance.setHostedby(hostedby);
         instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
         instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
         instance
-            .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES));
+            .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
         instance.setLicense(field(doc.valueOf("//oaf:license"), info));
         instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
         instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
@@ -211,7 +211,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 
     @Override
     protected Qualifier prepareLanguages(final Document doc) {
-        return prepareQualifier(doc, "//datacite:language", DNET_LANGUAGES, DNET_LANGUAGES);
+        return prepareQualifier(doc, "//datacite:language", DNET_LANGUAGES);
     }
 
     @Override
@@ -239,7 +239,7 @@
 
     @Override
     protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
-        return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages");
+        return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages");
"dnet:programming_languages"); } @Override @@ -366,7 +366,26 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { return prepareQualifier( - doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, - DNET_DATA_CITE_RESOURCE); + doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE); } + + @Override + protected List prepareResultPids(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + res + .addAll( + prepareListStructPropsWithValidQualifier( + doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)); + res + .addAll( + prepareListStructPropsWithValidQualifier( + doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", DNET_PID_TYPES, info)); + res + .addAll( + prepareListStructPropsWithValidQualifier( + doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", + "@alternateIdentifierType", DNET_PID_TYPES, info)); + return res; + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java new file mode 100644 index 0000000000..7714f6d90c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.oa.graph.raw.common; + +import java.util.HashMap; +import java.util.Map; + +public class Vocabulary { + + private final String id; + private final String name; + + private final Map terms = new HashMap<>(); + + public Vocabulary(final String id, final String name) { + this.id = id; + this.name = name; + } + + public String getId() { + return id; + } + + public String getName() { + return name; + } + + protected Map getTerms() { + return terms; + } + + public VocabularyTerm getTerm(final String id) { + return terms.get(id.toLowerCase()); + } + + protected void addTerm(final String id, final String name) { + terms.put(id.toLowerCase(), new VocabularyTerm(id, name)); + } + + protected boolean termExists(final String id) { + return terms.containsKey(id.toLowerCase()); + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java new file mode 100644 index 0000000000..127f73e220 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java @@ -0,0 +1,49 @@ + +package eu.dnetlib.dhp.oa.graph.raw.common; + +import java.util.HashMap; +import java.util.Map; + +import eu.dnetlib.dhp.schema.oaf.Qualifier; + +public class VocabularyGroup { + + private final Map vocs = new HashMap<>(); + + public void addVocabulary(final String id, final String name) { + vocs.put(id.toLowerCase(), new Vocabulary(id, name)); + } + + public void addTerm(final String vocId, final String id, final String name) { + if (vocabularyExists(vocId)) { + vocs.get(vocId.toLowerCase()).addTerm(id, name); + } + } + + public VocabularyTerm getTerm(final String vocId, final String id) { + if (termExists(vocId, id)) { + return vocs.get(vocId.toLowerCase()).getTerm(id); + } else { + return new VocabularyTerm(id, id); + } + } + + public Qualifier getTermAsQualifier(final String vocId, final String id) { + if 
+            final Vocabulary v = vocs.get(vocId.toLowerCase());
+            final VocabularyTerm t = v.getTerm(id);
+            return OafMapperUtils.qualifier(t.getId(), t.getName(), v.getId(), v.getName());
+        } else {
+            return OafMapperUtils.qualifier(id, id, vocId, vocId);
+        }
+    }
+
+    public boolean termExists(final String vocId, final String id) {
+        return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id);
+    }
+
+    public boolean vocabularyExists(final String vocId) {
+        return vocs.containsKey(vocId.toLowerCase());
+    }
+
+}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java
new file mode 100644
index 0000000000..b3c7859231
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java
@@ -0,0 +1,22 @@
+
+package eu.dnetlib.dhp.oa.graph.raw.common;
+
+public class VocabularyTerm {
+
+    private final String id;
+    private final String name;
+
+    public VocabularyTerm(final String id, final String name) {
+        this.id = id;
+        this.name = name;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json
index 293bf041b9..9e3992bcfb 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json
@@ -18,22 +18,9 @@
     "paramRequired": true
   },
   {
-    "paramName": "pgurl",
-    "paramLongName": "postgresUrl",
-    "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
+    "paramName": "islookup",
+    "paramLongName": "islookup",
+    "paramDescription": "the url of the ISLookupService",
     "paramRequired": true
-  },
-  {
-    "paramName": "pguser",
-    "paramLongName": "postgresUser",
-    "paramDescription": "postgres user",
-    "paramRequired": false
-  },
-  {
-    "paramName": "pgpasswd",
-    "paramLongName": "postgresPassword",
-    "paramDescription": "postgres password",
-    "paramRequired": false
   }
-
 ]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
index fa015499c9..f8426c35f2 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml
@@ -34,6 +34,10 @@
             <name>mongoDb</name>
             <description>mongo database</description>
         </property>
+        <property>
+            <name>isLookupUrl</name>
+            <description>the address of the lookUp service</description>
+        </property>
         <property>
             <name>sparkDriverMemory</name>
             <description>memory for driver process</description>
@@ -233,9 +237,7 @@
                             <arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
                             <arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg>
-                            <arg>--postgresUrl</arg><arg>${postgresURL}</arg>
-                            <arg>--postgresUser</arg><arg>${postgresUser}</arg>
-                            <arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
+                            <arg>--islookup</arg><arg>${isLookupUrl}</arg>
@@ -282,9 +284,7 @@
                             <arg>--sourcePaths</arg><arg>${contentPath}/db_records,${contentPath}/oaf_records,${contentPath}/odf_records</arg>
                             <arg>--targetPath</arg><arg>${workingDir}/entities</arg>
-                            <arg>--postgresUrl</arg><arg>${postgresURL}</arg>
-                            <arg>--postgresUser</arg><arg>${postgresUser}</arg>
-                            <arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
+                            <arg>--islookup</arg><arg>${isLookupUrl}</arg>
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml
index cd0a4025e8..f6485ea9c3 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step2/oozie_app/workflow.xml
@@ -9,17 +9,10 @@
             <description>the temporary path to store entities before dispatching</description>
         </property>
         <property>
-            <name>postgresURL</name>
-            <description>the postgres URL to access to the database</description>
-        </property>
-        <property>
-            <name>postgresUser</name>
-            <description>the user postgres</description>
-        </property>
-        <property>
-            <name>postgresPassword</name>
-            <description>the password postgres</description>
+            <name>isLookupUrl</name>
+            <description>the address of the lookUp service</description>
         </property>
+
         <property>
             <name>sparkDriverMemory</name>
             <description>memory for driver process</description>
@@ -62,9 +55,7 @@
                             <arg>-mt</arg> <arg>yarn-cluster</arg>
                             <arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
                             <arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
-                            <arg>-pgurl</arg><arg>${postgresURL}</arg>
-                            <arg>-pguser</arg><arg>${postgresUser}</arg>
-                            <arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
+                            <arg>--islookup</arg><arg>${isLookupUrl}</arg>
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery
new file mode 100644
index 0000000000..4938c0aa42
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery
@@ -0,0 +1,5 @@
+for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')
+    let $vocid := $x//VOCABULARY_NAME/@code
+    let $vocname := $x//VOCABULARY_NAME/text()
+    for $term in ($x//TERM)
+        return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name)
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
index b9da9fb297..dad427ce4b 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@@ -9,7 +9,6 @@ import static org.mockito.Mockito.when;
 
 import java.io.IOException;
 import java.util.List;
-import java.util.Map;
 import java.util.Optional;
 
 import org.apache.commons.io.IOUtils;
@@ -20,6 +19,8 @@ import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;
 
+import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
+import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
@@ -34,18 +35,27 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 public class MappersTest {
 
     @Mock
-    private Map<String, String> code2name;
+    private VocabularyGroup vocs;
 
     @BeforeEach
     public void setUp() throws Exception {
-        when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0));
+        when(vocs.getTermAsQualifier(anyString(), anyString()))
+            .thenAnswer(
+                invocation -> OafMapperUtils
+                    .qualifier(
+                        invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0),
+                        invocation.getArgument(0)));
+
+        when(vocs.termExists(anyString(), anyString())).thenReturn(true);
+
     }
 
     @Test
     void testPublication() throws IOException {
+
         final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
 
-        final List<Oaf> list = new OafToOafMapper(code2name).processMdRecord(xml);
+        final List<Oaf> list = new OafToOafMapper(vocs).processMdRecord(xml);
 
         assertEquals(3, list.size());
         assertTrue(list.get(0) instanceof Publication);
@@ -86,6 +96,10 @@ public class MappersTest {
         assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline()));
         assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
 
+        assertTrue(p.getPid().size() > 0);
+        assertEquals(p.getPid().get(0).getValue(), "10.3897/oneeco.2.e13718");
+        assertEquals(p.getPid().get(0).getQualifier().getClassid(), "doi");
+
         assertNotNull(p.getInstance());
         assertTrue(p.getInstance().size() > 0);
         p
@@ -115,6 +129,7 @@ public class MappersTest {
         assertTrue(StringUtils.isNotBlank(r1.getRelType()));
         assertTrue(StringUtils.isNotBlank(r2.getRelType()));
 
+        // System.out.println(new ObjectMapper().writeValueAsString(p));
         // System.out.println(new ObjectMapper().writeValueAsString(r1));
         // System.out.println(new ObjectMapper().writeValueAsString(r2));
     }
@@ -123,7 +138,7 @@ public class MappersTest {
     void testDataset() throws IOException {
         final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
 
-        final List<Oaf> list = new OdfToOafMapper(code2name).processMdRecord(xml);
+        final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml);
 
         assertEquals(3, list.size());
         assertTrue(list.get(0) instanceof Dataset);
@@ -205,7 +220,7 @@ public class MappersTest {
     void testSoftware() throws IOException {
         final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
 
-        final List<Oaf> list = new OdfToOafMapper(code2name).processMdRecord(xml);
+        final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml);
 
         assertEquals(1, list.size());
         assertTrue(list.get(0) instanceof Software);
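To make the behaviour of the new lookup concrete, here is a minimal sketch (not part of the patch; the vocabulary content and the class name are assumed for illustration) of how getTermAsQualifier resolves a known term and how termExists drives the filtering performed by prepareListStructPropsWithValidQualifier and therefore prepareResultPids:

```java
// Illustrative sketch only: vocabulary and term values are invented.
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Qualifier;

public class VocabularyLookupSketch {

	public static void main(final String[] args) {
		final VocabularyGroup vocs = new VocabularyGroup();
		vocs.addVocabulary("dnet:pid_types", "dnet:pid_types");
		vocs.addTerm("dnet:pid_types", "doi", "Digital Object Identifier");

		// Known term: resolved to a Qualifier carrying the term name.
		final Qualifier doi = vocs.getTermAsQualifier("dnet:pid_types", "doi");
		System.out.println(doi.getClassid() + " / " + doi.getClassname()); // doi / Digital Object Identifier

		// Unknown term: getTermAsQualifier falls back to echoing the id, while
		// termExists returns false, so prepareListStructPropsWithValidQualifier
		// drops such pids instead of propagating an unresolved identifier type.
		System.out.println(vocs.termExists("dnet:pid_types", "unknown-type")); // false
	}
}
```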