match terms with vocabularies

This commit is contained in:
Michele Artini 2020-05-27 11:34:13 +02:00
parent c15d997925
commit 3ceb2d2853
11 changed files with 225 additions and 130 deletions

View File

@ -38,6 +38,7 @@ import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper; import org.dom4j.DocumentHelper;
import org.dom4j.Node; import org.dom4j.Node;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.LicenseComparator; import eu.dnetlib.dhp.schema.common.LicenseComparator;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.Context;
@ -60,7 +61,7 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public abstract class AbstractMdRecordToOafMapper { public abstract class AbstractMdRecordToOafMapper {
protected final Map<String, String> code2name; protected final VocabularyGroup vocs;
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
@ -84,8 +85,8 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) { protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs) {
this.code2name = code2name; this.vocs = vocs;
} }
public List<Oaf> processMdRecord(final String xml) { public List<Oaf> processMdRecord(final String xml) {
@ -421,14 +422,12 @@ public abstract class AbstractMdRecordToOafMapper {
return null; return null;
} }
protected Qualifier prepareQualifier( protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) {
final Node node, return prepareQualifier(node.valueOf(xpath).trim(), schemeId);
final String xpath, }
final String schemeId,
final String schemeName) { protected Qualifier prepareQualifier(final String classId, final String schemeId) {
final String classId = node.valueOf(xpath); return vocs.getTermAsQualifier(schemeId, classId);
final String className = code2name.get(classId);
return qualifier(classId, className, schemeId, schemeName);
} }
protected List<StructuredProperty> prepareListStructProps( protected List<StructuredProperty> prepareListStructProps(
@ -436,14 +435,31 @@ public abstract class AbstractMdRecordToOafMapper {
final String xpath, final String xpath,
final String xpathClassId, final String xpathClassId,
final String schemeId, final String schemeId,
final String schemeName,
final DataInfo info) { final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>(); final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) { for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o; final Node n = (Node) o;
final String classId = n.valueOf(xpathClassId); final String classId = n.valueOf(xpathClassId).trim();
final String className = code2name.get(classId); res.add(structuredProperty(n.getText(), prepareQualifier(classId, schemeId), info));
res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); }
return res;
}
protected List<StructuredProperty> prepareListStructPropsWithValidQualifier(
final Node node,
final String xpath,
final String xpathClassId,
final String schemeId,
final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
final String classId = n.valueOf(xpathClassId).trim();
if (vocs.termExists(schemeId, classId)) {
res.add(structuredProperty(n.getText(), vocs.getTermAsQualifier(schemeId, classId), info));
}
} }
return res; return res;
} }

View File

@ -4,11 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException; import java.io.IOException;
import java.sql.SQLException;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -29,8 +26,8 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Datasource;
@ -71,25 +68,24 @@ public class GenerateEntitiesApplication {
final String sourcePaths = parser.get("sourcePaths"); final String sourcePaths = parser.get("sourcePaths");
final String targetPath = parser.get("targetPath"); final String targetPath = parser.get("targetPath");
final String dbUrl = parser.get("postgresUrl"); // final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser"); // final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword"); // final String dbPassword = parser.get("postgresPassword");
final String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
final Map<String, String> code2name = loadVocsFromDB(dbUrl, dbUser, dbPassword); final VocabularyGroup vocs = loadVocsFromIS(isLookupUrl); // MAP: vocId -> voc
code2name.putAll(loadVocsFromIS(isLookupUrl));
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> { runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, targetPath); removeOutputDir(spark, targetPath);
generateEntities(spark, code2name, sourcePaths, targetPath); generateEntities(spark, vocs, sourcePaths, targetPath);
}); });
} }
private static void generateEntities( private static void generateEntities(
final SparkSession spark, final SparkSession spark,
final Map<String, String> code2name, final VocabularyGroup vocs,
final String sourcePaths, final String sourcePaths,
final String targetPath) { final String targetPath) {
@ -110,7 +106,7 @@ public class GenerateEntitiesApplication {
sc sc
.sequenceFile(sp, Text.class, Text.class) .sequenceFile(sp, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) .map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.map(k -> convertToListOaf(k._1(), k._2(), code2name)) .map(k -> convertToListOaf(k._1(), k._2(), vocs))
.filter(Objects::nonNull) .filter(Objects::nonNull)
.flatMap(list -> list.iterator())); .flatMap(list -> list.iterator()));
} }
@ -140,14 +136,14 @@ public class GenerateEntitiesApplication {
private static List<Oaf> convertToListOaf( private static List<Oaf> convertToListOaf(
final String id, final String id,
final String s, final String s,
final Map<String, String> code2name) { final VocabularyGroup vocs) {
final String type = StringUtils.substringAfter(id, ":"); final String type = StringUtils.substringAfter(id, ":");
switch (type.toLowerCase()) { switch (type.toLowerCase()) {
case "native_oaf": case "native_oaf":
return new OafToOafMapper(code2name).processMdRecord(s); return new OafToOafMapper(vocs).processMdRecord(s);
case "native_odf": case "native_odf":
return new OdfToOafMapper(code2name).processMdRecord(s); return new OdfToOafMapper(vocs).processMdRecord(s);
case "datasource": case "datasource":
return Arrays.asList(convertFromJson(s, Datasource.class)); return Arrays.asList(convertFromJson(s, Datasource.class));
case "organization": case "organization":
@ -169,32 +165,7 @@ public class GenerateEntitiesApplication {
} }
} }
private static Map<String, String> loadVocsFromDB( private static VocabularyGroup loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException {
final String dbUrl,
final String dbUser,
final String dbPassword) throws IOException {
log.info("Loading vocabulary terms from db...");
final Map<String, String> map = new HashMap<>();
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
dbClient
.processResults("select code, name from class", rs -> {
try {
map.put(rs.getString("code"), rs.getString("name"));
} catch (final SQLException e) {
e.printStackTrace();
}
});
}
log.info("Found " + map.size() + " terms.");
return map;
}
private static Map<String, String> loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException {
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final String xquery = IOUtils final String xquery = IOUtils
@ -202,16 +173,25 @@ public class GenerateEntitiesApplication {
GenerateEntitiesApplication.class GenerateEntitiesApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery"));
final Map<String, String> map = new HashMap<>(); final VocabularyGroup vocs = new VocabularyGroup();
for (final String s : isLookUpService.quickSearchProfile(xquery)) { for (final String s : isLookUpService.quickSearchProfile(xquery)) {
final String[] arr = s.split("@=@"); final String[] arr = s.split("@=@");
if (arr.length == 4) { if (arr.length == 4) {
map.put(arr[2].trim(), arr[3].trim()); final String vocId = arr[0].trim();
final String vocName = arr[1].trim();
final String termId = arr[2].trim();
final String termName = arr[3].trim();
if (!vocs.vocabularyExists(vocId)) {
vocs.addVocabulary(vocId, vocName);
}
vocs.addTerm(vocId, termId, termName);
} }
} }
return map; return vocs;
} }
private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) { private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {

View File

@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO; import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET; import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
@ -13,7 +14,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -24,6 +24,7 @@ import org.dom4j.Node;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Field;
@ -36,8 +37,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OafToOafMapper extends AbstractMdRecordToOafMapper { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
public OafToOafMapper(final Map<String, String> code2name) { public OafToOafMapper(final VocabularyGroup vocs) {
super(code2name); super(vocs);
} }
@Override @Override
@ -83,7 +84,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected Qualifier prepareLanguages(final Document doc) { protected Qualifier prepareLanguages(final Document doc) {
return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES, DNET_LANGUAGES); return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES);
} }
@Override @Override
@ -130,14 +131,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
final Instance instance = new Instance(); final Instance instance = new Instance();
instance instance
.setInstancetype( .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
instance.setCollectedfrom(collectedfrom); instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby); instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance instance
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance instance
@ -300,7 +300,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
return prepareListStructProps( return prepareListStructPropsWithValidQualifier(
doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info); doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info);
} }
} }

View File

@ -8,6 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS; import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF; import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
@ -21,7 +22,6 @@ import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -29,6 +29,7 @@ import org.dom4j.Document;
import org.dom4j.Node; import org.dom4j.Node;
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Field;
@ -43,8 +44,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/"; public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
public OdfToOafMapper(final Map<String, String> code2name) { public OdfToOafMapper(final VocabularyGroup vocs) {
super(code2name); super(vocs);
} }
@Override @Override
@ -120,14 +121,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final Instance instance = new Instance(); final Instance instance = new Instance();
instance instance
.setInstancetype( .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
instance.setCollectedfrom(collectedfrom); instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby); instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance instance
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES)); .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
@ -211,7 +211,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected Qualifier prepareLanguages(final Document doc) { protected Qualifier prepareLanguages(final Document doc) {
return prepareQualifier(doc, "//datacite:language", DNET_LANGUAGES, DNET_LANGUAGES); return prepareQualifier(doc, "//datacite:language", DNET_LANGUAGES);
} }
@Override @Override
@ -239,7 +239,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages");
} }
@Override @Override
@ -366,8 +366,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return prepareQualifier( return prepareQualifier(
doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE);
DNET_DATA_CITE_RESOURCE);
} }
@Override @Override
@ -375,18 +374,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final List<StructuredProperty> res = new ArrayList<>(); final List<StructuredProperty> res = new ArrayList<>();
res res
.addAll( .addAll(
prepareListStructProps( prepareListStructPropsWithValidQualifier(
doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info));
res res
.addAll( .addAll(
prepareListStructProps( prepareListStructPropsWithValidQualifier(
doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types", doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", DNET_PID_TYPES, info));
"dnet:pid_types", info));
res res
.addAll( .addAll(
prepareListStructProps( prepareListStructPropsWithValidQualifier(
doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']",
"@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info)); "@alternateIdentifierType", DNET_PID_TYPES, info));
return res; return res;
} }

View File

@ -0,0 +1,42 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
import java.util.HashMap;
import java.util.Map;
/**
 * A named vocabulary: an identifier, a display name and a set of terms that
 * are indexed case-insensitively by their id.
 *
 * <p>The class is {@link java.io.Serializable} so that objects holding a
 * vocabulary can be serialized, e.g. when they are captured by a closure that
 * is shipped to remote workers.
 */
public class Vocabulary implements java.io.Serializable {

    private static final long serialVersionUID = 1L;

    private final String id;
    private final String name;

    /** Terms keyed by their lower-cased id; the term itself keeps the original casing. */
    private final Map<String, VocabularyTerm> terms = new HashMap<>();

    /**
     * @param id   the vocabulary identifier
     * @param name the vocabulary display name
     */
    public Vocabulary(final String id, final String name) {
        this.id = id;
        this.name = name;
    }

    /** @return the vocabulary identifier, as passed to the constructor */
    public String getId() {
        return id;
    }

    /** @return the vocabulary display name, as passed to the constructor */
    public String getName() {
        return name;
    }

    /** @return the live (mutable) map of terms, keyed by lower-cased term id */
    protected Map<String, VocabularyTerm> getTerms() {
        return terms;
    }

    /**
     * Looks up a term ignoring the case of its id.
     *
     * @param id the term id; may be {@code null}
     * @return the matching term, or {@code null} when there is none
     */
    public VocabularyTerm getTerm(final String id) {
        return id == null ? null : terms.get(normalize(id));
    }

    /**
     * Registers a term, replacing any term already stored under the same
     * (case-insensitive) id.
     *
     * @param id   the term id; must not be {@code null}
     * @param name the term display name
     * @throws NullPointerException if {@code id} is {@code null}
     */
    protected void addTerm(final String id, final String name) {
        java.util.Objects.requireNonNull(id, "term id must not be null");
        terms.put(normalize(id), new VocabularyTerm(id, name));
    }

    /**
     * @param id the term id; may be {@code null}
     * @return {@code true} when a term is stored under the (case-insensitive) id
     */
    protected boolean termExists(final String id) {
        return id != null && terms.containsKey(normalize(id));
    }

    /**
     * Builds the map key for a term id. Locale.ROOT keeps the folding stable
     * regardless of the JVM default locale (e.g. "I" must always become "i").
     */
    private static String normalize(final String id) {
        return id.toLowerCase(java.util.Locale.ROOT);
    }
}

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
import java.util.HashMap;
import java.util.Map;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
/**
 * A collection of vocabularies indexed case-insensitively by vocabulary id,
 * with helpers to resolve a (vocabulary id, term id) pair into a term or a
 * {@link Qualifier}.
 *
 * <p>The class is {@link java.io.Serializable} because instances are captured
 * by the Spark closure that maps each record to its Oaf objects, and such
 * closures are serialized before being sent to the executors.
 */
public class VocabularyGroup implements java.io.Serializable {

    private static final long serialVersionUID = 1L;

    /** Vocabularies keyed by their lower-cased id. */
    private final Map<String, Vocabulary> vocs = new HashMap<>();

    /**
     * Registers an empty vocabulary. A vocabulary already stored under the
     * same (case-insensitive) id is replaced, together with its terms.
     *
     * @param id   the vocabulary id
     * @param name the vocabulary display name
     */
    public void addVocabulary(final String id, final String name) {
        vocs.put(normalize(id), new Vocabulary(id, name));
    }

    /**
     * Adds a term to an existing vocabulary; does nothing when the vocabulary
     * is unknown.
     *
     * @param vocId the id of the target vocabulary
     * @param id    the term id
     * @param name  the term display name
     */
    public void addTerm(final String vocId, final String id, final String name) {
        final Vocabulary voc = findVocabulary(vocId);
        if (voc != null) {
            voc.addTerm(id, name);
        }
    }

    /**
     * Resolves a term.
     *
     * @param vocId the vocabulary id
     * @param id    the term id
     * @return the registered term, or a synthetic term whose id and name are
     *         both {@code id} when the vocabulary or the term is unknown
     */
    public VocabularyTerm getTerm(final String vocId, final String id) {
        final VocabularyTerm term = findTerm(findVocabulary(vocId), id);
        return term != null ? term : new VocabularyTerm(id, id);
    }

    /**
     * Resolves a term and converts it to a {@link Qualifier}.
     *
     * @param vocId the vocabulary id (used as scheme)
     * @param id    the term id (used as class)
     * @return a qualifier built from the registered term and vocabulary, or,
     *         when the term is unknown, one that echoes {@code id} and
     *         {@code vocId} as both identifiers and names
     */
    public Qualifier getTermAsQualifier(final String vocId, final String id) {
        final Vocabulary voc = findVocabulary(vocId);
        final VocabularyTerm term = findTerm(voc, id);
        if (term != null) {
            return OafMapperUtils.qualifier(term.getId(), term.getName(), voc.getId(), voc.getName());
        }
        return OafMapperUtils.qualifier(id, id, vocId, vocId);
    }

    /**
     * @param vocId the vocabulary id
     * @param id    the term id
     * @return {@code true} when the vocabulary is registered and contains the term
     */
    public boolean termExists(final String vocId, final String id) {
        final Vocabulary voc = findVocabulary(vocId);
        return voc != null && id != null && voc.termExists(id);
    }

    /**
     * @param vocId the vocabulary id
     * @return {@code true} when a vocabulary is registered under the (case-insensitive) id
     */
    public boolean vocabularyExists(final String vocId) {
        return vocId != null && vocs.containsKey(normalize(vocId));
    }

    /** Returns the vocabulary stored under the id, or null when absent or when the id is null. */
    private Vocabulary findVocabulary(final String vocId) {
        return vocId == null ? null : vocs.get(normalize(vocId));
    }

    /** Returns the term of the given vocabulary, or null when either argument is null or the term is absent. */
    private static VocabularyTerm findTerm(final Vocabulary voc, final String id) {
        return voc == null || id == null ? null : voc.getTerm(id);
    }

    /**
     * Builds the map key for a vocabulary id. Locale.ROOT keeps the folding
     * stable regardless of the JVM default locale.
     */
    private static String normalize(final String vocId) {
        return vocId == null ? null : vocId.toLowerCase(java.util.Locale.ROOT);
    }
}

View File

@ -0,0 +1,22 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
/**
 * An immutable vocabulary term: an identifier and its display name.
 *
 * <p>The class is {@link java.io.Serializable} so that containers holding
 * terms can themselves be serialized.
 */
public class VocabularyTerm implements java.io.Serializable {

    private static final long serialVersionUID = 1L;

    private final String id;
    private final String name;

    /**
     * @param id   the term identifier
     * @param name the term display name
     */
    public VocabularyTerm(final String id, final String name) {
        this.id = id;
        this.name = name;
    }

    /** @return the term identifier, as passed to the constructor */
    public String getId() {
        return id;
    }

    /** @return the term display name, as passed to the constructor */
    public String getName() {
        return name;
    }
}

View File

@ -17,27 +17,9 @@
"paramDescription": "the path of the target file", "paramDescription": "the path of the target file",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
},
{ {
"paramName": "islookup", "paramName": "islookup",
"paramLongName": "isLookupUrl", "paramLongName": "islookup",
"paramDescription": "the url of the ISLookupService", "paramDescription": "the url of the ISLookupService",
"paramRequired": true "paramRequired": true
} }

View File

@ -34,6 +34,10 @@
<name>mongoDb</name> <name>mongoDb</name>
<description>mongo database</description> <description>mongo database</description>
</property> </property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
@ -233,9 +237,7 @@
</spark-opts> </spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg> <arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
<arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg> <arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg> <arg>--islookup</arg><arg>${isLookupUrl}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
</spark> </spark>
<ok to="GenerateGraph_claims"/> <ok to="GenerateGraph_claims"/>
<error to="Kill"/> <error to="Kill"/>
@ -282,9 +284,7 @@
</spark-opts> </spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/db_records,${contentPath}/oaf_records,${contentPath}/odf_records</arg> <arg>--sourcePaths</arg><arg>${contentPath}/db_records,${contentPath}/oaf_records,${contentPath}/odf_records</arg>
<arg>--targetPath</arg><arg>${workingDir}/entities</arg> <arg>--targetPath</arg><arg>${workingDir}/entities</arg>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg> <arg>--islookup</arg><arg>${isLookupUrl}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
</spark> </spark>
<ok to="GenerateGraph"/> <ok to="GenerateGraph"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -9,17 +9,10 @@
<description>the temporary path to store entities before dispatching</description> <description>the temporary path to store entities before dispatching</description>
</property> </property>
<property> <property>
<name>postgresURL</name> <name>isLookupUrl</name>
<description>the postgres URL to access to the database</description> <description>the address of the lookUp service</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property> </property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -62,9 +55,7 @@
<arg>-mt</arg> <arg>yarn-cluster</arg> <arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg> <arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
<arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg> <arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg> <arg>--islookup</arg><arg>${isLookupUrl}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -9,7 +9,6 @@ import static org.mockito.Mockito.when;
import java.io.IOException; import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Optional; import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -20,6 +19,8 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Dataset;
@ -34,18 +35,27 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class MappersTest { public class MappersTest {
@Mock @Mock
private Map<String, String> code2name; private VocabularyGroup vocs;
@BeforeEach @BeforeEach
public void setUp() throws Exception { public void setUp() throws Exception {
when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0)); when(vocs.getTermAsQualifier(anyString(), anyString()))
.thenAnswer(
invocation -> OafMapperUtils
.qualifier(
invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0),
invocation.getArgument(0)));
when(vocs.termExists(anyString(), anyString())).thenReturn(true);
} }
@Test @Test
void testPublication() throws IOException { void testPublication() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
final List<Oaf> list = new OafToOafMapper(code2name).processMdRecord(xml); final List<Oaf> list = new OafToOafMapper(vocs).processMdRecord(xml);
assertEquals(3, list.size()); assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Publication); assertTrue(list.get(0) instanceof Publication);
@ -86,6 +96,10 @@ public class MappersTest {
assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline()));
assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
assertTrue(p.getPid().size() > 0);
assertEquals(p.getPid().get(0).getValue(), "10.3897/oneeco.2.e13718");
assertEquals(p.getPid().get(0).getQualifier().getClassid(), "doi");
assertNotNull(p.getInstance()); assertNotNull(p.getInstance());
assertTrue(p.getInstance().size() > 0); assertTrue(p.getInstance().size() > 0);
p p
@ -115,6 +129,7 @@ public class MappersTest {
assertTrue(StringUtils.isNotBlank(r1.getRelType())); assertTrue(StringUtils.isNotBlank(r1.getRelType()));
assertTrue(StringUtils.isNotBlank(r2.getRelType())); assertTrue(StringUtils.isNotBlank(r2.getRelType()));
// System.out.println(new ObjectMapper().writeValueAsString(p));
// System.out.println(new ObjectMapper().writeValueAsString(r1)); // System.out.println(new ObjectMapper().writeValueAsString(r1));
// System.out.println(new ObjectMapper().writeValueAsString(r2)); // System.out.println(new ObjectMapper().writeValueAsString(r2));
} }
@ -123,7 +138,7 @@ public class MappersTest {
void testDataset() throws IOException { void testDataset() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
final List<Oaf> list = new OdfToOafMapper(code2name).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml);
assertEquals(3, list.size()); assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Dataset); assertTrue(list.get(0) instanceof Dataset);
@ -205,7 +220,7 @@ public class MappersTest {
void testSoftware() throws IOException { void testSoftware() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
final List<Oaf> list = new OdfToOafMapper(code2name).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml);
assertEquals(1, list.size()); assertEquals(1, list.size());
assertTrue(list.get(0) instanceof Software); assertTrue(list.get(0) instanceof Software);