enrichment steps #38
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
package eu.dnetlib.dhp.common;
|
||||||
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
|
@ -61,12 +61,6 @@ public class BlackListTest {
|
||||||
spark.stop();
|
spark.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); final String outputPath =
|
|
||||||
* parser.get("outputPath"); log.info("outputPath {}: ", outputPath); final String blacklistPath =
|
|
||||||
* parser.get("hdfsPath"); log.info("blacklistPath {}: ", blacklistPath); final String mergesPath =
|
|
||||||
* parser.get("mergesPath"); log.info("mergesPath {}: ", mergesPath);
|
|
||||||
*/
|
|
||||||
@Test
|
@Test
|
||||||
public void noRemoveTest() throws Exception {
|
public void noRemoveTest() throws Exception {
|
||||||
SparkRemoveBlacklistedRelationJob
|
SparkRemoveBlacklistedRelationJob
|
||||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.bulktag;
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
|
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -100,6 +101,7 @@ public class SparkBulkTagJob {
|
||||||
|
|
||||||
ResultTagger resultTagger = new ResultTagger();
|
ResultTagger resultTagger = new ResultTagger();
|
||||||
readPath(spark, inputPath, resultClazz)
|
readPath(spark, inputPath, resultClazz)
|
||||||
|
.map(patchResult(), Encoders.bean(resultClazz))
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<R, R>) value -> resultTagger
|
(MapFunction<R, R>) value -> resultTagger
|
||||||
.enrichContextCriteria(
|
.enrichContextCriteria(
|
||||||
|
@ -119,4 +121,17 @@ public class SparkBulkTagJob {
|
||||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO remove this hack as soon as the values fixed by this method will be provided as NON null
|
||||||
|
private static <R extends Result> MapFunction<R, R> patchResult() {
|
||||||
|
return (MapFunction<R, R>) r -> {
|
||||||
|
if (r.getDataInfo().getDeletedbyinference() == null) {
|
||||||
|
r.getDataInfo().setDeletedbyinference(false);
|
||||||
|
}
|
||||||
|
if (r.getContext() == null) {
|
||||||
|
r.setContext(new ArrayList<>());
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -131,7 +131,7 @@ public class CommunityConfiguration implements Serializable {
|
||||||
p -> {
|
p -> {
|
||||||
if (p.getSnd() == null)
|
if (p.getSnd() == null)
|
||||||
return p.getFst();
|
return p.getFst();
|
||||||
if (((SelectionConstraints) p.getSnd()).verifyCriteria(param))
|
if (p.getSnd().verifyCriteria(param))
|
||||||
return p.getFst();
|
return p.getFst();
|
||||||
else
|
else
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class VerbResolver implements Serializable {
|
||||||
.collect(
|
.collect(
|
||||||
Collectors
|
Collectors
|
||||||
.toMap(
|
.toMap(
|
||||||
value -> (String) ((ClassInfo) value)
|
value -> (String) value
|
||||||
.getAnnotationInfo()
|
.getAnnotationInfo()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getParameterValues()
|
.getParameterValues()
|
||||||
|
|
|
@ -77,9 +77,15 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
List<String> allowedtypes,
|
List<String> allowedtypes,
|
||||||
String inputPath,
|
String inputPath,
|
||||||
String outputPath) {
|
String outputPath) {
|
||||||
String whitelisted = "";
|
String whitelisted = " d.id = '" + whitelist.get(0) + "'";
|
||||||
for (String i : whitelist) {
|
for (int i = 1; i < whitelist.size(); i++) {
|
||||||
whitelisted += " OR id = '" + i + "'";
|
whitelisted += " OR d.id = '" + whitelist.get(i) + "'";
|
||||||
|
}
|
||||||
|
|
||||||
|
String allowed = "d.datasourcetype.classid = '" + allowedtypes.get(0) + "'";
|
||||||
|
|
||||||
|
for (int i = 1; i < allowedtypes.size(); i++) {
|
||||||
|
allowed += " OR d.datasourcetype.classid = '" + allowedtypes.get(i) + "'";
|
||||||
}
|
}
|
||||||
|
|
||||||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
||||||
|
@ -90,26 +96,39 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
relation.createOrReplaceTempView("relation");
|
relation.createOrReplaceTempView("relation");
|
||||||
organization.createOrReplaceTempView("organization");
|
organization.createOrReplaceTempView("organization");
|
||||||
|
|
||||||
String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
|
// String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
|
||||||
+ "FROM ( SELECT id "
|
// + "FROM ( SELECT id "
|
||||||
+ " FROM datasource "
|
// + " FROM datasource "
|
||||||
+ " WHERE (datainfo.deletedbyinference = false "
|
// + " WHERE (datainfo.deletedbyinference = false "
|
||||||
+ whitelisted
|
// + whitelisted
|
||||||
+ ") "
|
// + ") "
|
||||||
+ getConstraintList("datasourcetype.classid = '", allowedtypes)
|
// + getConstraintList("datasourcetype.classid = '", allowedtypes)
|
||||||
+ ") d "
|
// + ") d "
|
||||||
+ "JOIN ( SELECT source, target "
|
// + "JOIN ( SELECT source, target "
|
||||||
+ " FROM relation "
|
// + " FROM relation "
|
||||||
+ " WHERE relclass = '"
|
// + " WHERE relclass = '"
|
||||||
+ ModelConstants.IS_PROVIDED_BY
|
// + ModelConstants.IS_PROVIDED_BY
|
||||||
+ "' "
|
// + "' "
|
||||||
+ " AND datainfo.deletedbyinference = false ) rel "
|
// + " AND datainfo.deletedbyinference = false ) rel "
|
||||||
+ "ON d.id = rel.source "
|
// + "ON d.id = rel.source "
|
||||||
+ "JOIN (SELECT id, country "
|
// + "JOIN (SELECT id, country "
|
||||||
+ " FROM organization "
|
// + " FROM organization "
|
||||||
+ " WHERE datainfo.deletedbyinference = false "
|
// + " WHERE datainfo.deletedbyinference = false "
|
||||||
+ " AND length(country.classid) > 0) o "
|
// + " AND length(country.classid) > 0) o "
|
||||||
+ "ON o.id = rel.target";
|
// + "ON o.id = rel.target";
|
||||||
|
|
||||||
|
String query = "SELECT source dataSourceId, " +
|
||||||
|
"named_struct('classid', country.classid, 'classname', country.classname) country " +
|
||||||
|
"FROM datasource d " +
|
||||||
|
"JOIN relation rel " +
|
||||||
|
"ON d.id = rel.source " +
|
||||||
|
"JOIN organization o " +
|
||||||
|
"ON o.id = rel.target " +
|
||||||
|
"WHERE rel.datainfo.deletedbyinference = false " +
|
||||||
|
"and rel.relclass = '" + ModelConstants.IS_PROVIDED_BY + "'" +
|
||||||
|
"and o.datainfo.deletedbyinference = false " +
|
||||||
|
"and length(o.country.classid) > 0 " +
|
||||||
|
"and (" + allowed + " or " + whitelisted + ")";
|
||||||
|
|
||||||
spark
|
spark
|
||||||
.sql(query)
|
.sql(query)
|
||||||
|
|
|
@ -4,7 +4,12 @@ package eu.dnetlib.dhp.countrypropagation;
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -13,6 +18,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class PrepareResultCountrySet {
|
public class PrepareResultCountrySet {
|
||||||
private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
|
private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
|
||||||
|
@ -60,6 +66,7 @@ public class PrepareResultCountrySet {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
removeOutputDir(spark, outputPath);
|
||||||
getPotentialResultToUpdate(
|
getPotentialResultToUpdate(
|
||||||
spark,
|
spark,
|
||||||
inputPath,
|
inputPath,
|
||||||
|
@ -89,10 +96,33 @@ public class PrepareResultCountrySet {
|
||||||
spark
|
spark
|
||||||
.sql(RESULT_COUNTRYSET_QUERY)
|
.sql(RESULT_COUNTRYSET_QUERY)
|
||||||
.as(Encoders.bean(ResultCountrySet.class))
|
.as(Encoders.bean(ResultCountrySet.class))
|
||||||
.write()
|
.toJavaRDD()
|
||||||
.option("compression", "gzip")
|
.mapToPair(value -> new Tuple2<>(value.getResultId(), value))
|
||||||
.mode(SaveMode.Append)
|
.reduceByKey((a, b) -> {
|
||||||
.json(outputPath);
|
ArrayList<CountrySbs> countryList = a.getCountrySet();
|
||||||
|
Set<String> countryCodes = countryList
|
||||||
|
.stream()
|
||||||
|
.map(country -> country.getClassid())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
b
|
||||||
|
.getCountrySet()
|
||||||
|
.stream()
|
||||||
|
.forEach(c -> {
|
||||||
|
if (!countryCodes.contains(c.getClassid())) {
|
||||||
|
countryList.add(c);
|
||||||
|
countryCodes.add(c.getClassid());
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
a.setCountrySet(countryList);
|
||||||
|
return a;
|
||||||
|
})
|
||||||
|
.map(couple -> OBJECT_MAPPER.writeValueAsString(couple._2()))
|
||||||
|
.saveAsTextFile(outputPath, GzipCodec.class);
|
||||||
|
// .write()
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .mode(SaveMode.Append)
|
||||||
|
// .json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
@ -121,30 +122,39 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void enrichAuthor(Author a, List<AutoritativeAuthor> au) {
|
private static void enrichAuthor(Author a, List<AutoritativeAuthor> au) {
|
||||||
|
PacePerson pp = new PacePerson(a.getFullname(), false);
|
||||||
for (AutoritativeAuthor aa : au) {
|
for (AutoritativeAuthor aa : au) {
|
||||||
if (enrichAuthor(aa, a)) {
|
if (enrichAuthor(aa, a, pp.getNormalisedFirstName(), pp.getNormalisedSurname())) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) {
|
private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author,
|
||||||
|
String author_name,
|
||||||
|
String author_surname) {
|
||||||
boolean toaddpid = false;
|
boolean toaddpid = false;
|
||||||
|
|
||||||
if (StringUtils.isNotEmpty(autoritative_author.getSurname())) {
|
if (StringUtils.isNotEmpty(autoritative_author.getSurname())) {
|
||||||
if (StringUtils.isNotEmpty(author.getSurname())) {
|
if (StringUtils.isNotEmpty(author.getSurname())) {
|
||||||
|
author_surname = author.getSurname();
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotEmpty(author_surname)) {
|
||||||
if (autoritative_author
|
if (autoritative_author
|
||||||
.getSurname()
|
.getSurname()
|
||||||
.trim()
|
.trim()
|
||||||
.equalsIgnoreCase(author.getSurname().trim())) {
|
.equalsIgnoreCase(author_surname.trim())) {
|
||||||
|
|
||||||
// have the same surname. Check the name
|
// have the same surname. Check the name
|
||||||
if (StringUtils.isNotEmpty(autoritative_author.getName())) {
|
if (StringUtils.isNotEmpty(autoritative_author.getName())) {
|
||||||
if (StringUtils.isNotEmpty(author.getName())) {
|
if (StringUtils.isNotEmpty(author.getName())) {
|
||||||
|
author_name = author.getName();
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotEmpty(author_name)) {
|
||||||
if (autoritative_author
|
if (autoritative_author
|
||||||
.getName()
|
.getName()
|
||||||
.trim()
|
.trim()
|
||||||
.equalsIgnoreCase(author.getName().trim())) {
|
.equalsIgnoreCase(author_name.trim())) {
|
||||||
toaddpid = true;
|
toaddpid = true;
|
||||||
}
|
}
|
||||||
// they could be differently written (i.e. only the initials of the name
|
// they could be differently written (i.e. only the initials of the name
|
||||||
|
@ -154,7 +164,7 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
.getName()
|
.getName()
|
||||||
.trim()
|
.trim()
|
||||||
.substring(0, 0)
|
.substring(0, 0)
|
||||||
.equalsIgnoreCase(author.getName().trim().substring(0, 0))) {
|
.equalsIgnoreCase(author_name.trim().substring(0, 0))) {
|
||||||
toaddpid = true;
|
toaddpid = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -105,11 +105,7 @@ public class SparkResultToProjectThroughSemRelJob {
|
||||||
.stream()
|
.stream()
|
||||||
.forEach(
|
.forEach(
|
||||||
(p -> {
|
(p -> {
|
||||||
if (potential_update
|
potential_update.getProjectSet().remove(p);
|
||||||
.getProjectSet()
|
|
||||||
.contains(p)) {
|
|
||||||
potential_update.getProjectSet().remove(p);
|
|
||||||
}
|
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
String resId = potential_update.getResultId();
|
String resId = potential_update.getResultId();
|
||||||
|
|
|
@ -7,6 +7,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
|
@ -19,6 +20,7 @@ import com.google.gson.Gson;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class PrepareResultCommunitySet {
|
public class PrepareResultCommunitySet {
|
||||||
|
|
||||||
|
@ -93,10 +95,24 @@ public class PrepareResultCommunitySet {
|
||||||
result_organizationset
|
result_organizationset
|
||||||
.map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
|
.map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class))
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.write()
|
.toJavaRDD()
|
||||||
.mode(SaveMode.Overwrite)
|
.mapToPair(value -> new Tuple2<>(value.getResultId(), value))
|
||||||
.option("compression", "gzip")
|
.reduceByKey((a, b) -> {
|
||||||
.json(outputPath);
|
ArrayList<String> cl = a.getCommunityList();
|
||||||
|
b.getCommunityList().stream().forEach(s -> {
|
||||||
|
if (!cl.contains(s)) {
|
||||||
|
cl.add(s);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
a.setCommunityList(cl);
|
||||||
|
return a;
|
||||||
|
})
|
||||||
|
.map(value -> OBJECT_MAPPER.writeValueAsString(value._2()))
|
||||||
|
.saveAsTextFile(outputPath, GzipCodec.class);
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static MapFunction<ResultOrganizations, ResultCommunityList> mapResultCommunityFn(
|
private static MapFunction<ResultOrganizations, ResultCommunityList> mapResultCommunityFn(
|
||||||
|
|
|
@ -136,9 +136,7 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
.stream()
|
.stream()
|
||||||
.forEach(
|
.forEach(
|
||||||
rId -> {
|
rId -> {
|
||||||
if (organization_list.contains(rId)) {
|
organization_list.remove(rId);
|
||||||
organization_list.remove(rId);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
String resultId = potential_update.getResultId();
|
String resultId = potential_update.getResultId();
|
||||||
|
|
|
@ -10,10 +10,27 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
|
@ -21,6 +38,7 @@ import org.dom4j.DocumentFactory;
|
||||||
import org.dom4j.DocumentHelper;
|
import org.dom4j.DocumentHelper;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.LicenseComparator;
|
import eu.dnetlib.dhp.schema.common.LicenseComparator;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||||
|
@ -43,7 +61,7 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public abstract class AbstractMdRecordToOafMapper {
|
public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
protected final Map<String, String> code2name;
|
protected final VocabularyGroup vocs;
|
||||||
|
|
||||||
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
|
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
|
||||||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||||
|
@ -67,8 +85,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
|
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
|
||||||
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||||
|
|
||||||
protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
|
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs) {
|
||||||
this.code2name = code2name;
|
this.vocs = vocs;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Oaf> processMdRecord(final String xml) {
|
public List<Oaf> processMdRecord(final String xml) {
|
||||||
|
@ -247,10 +265,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
|
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
|
||||||
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
|
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
|
||||||
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
||||||
r
|
r.setPid(prepareResultPids(doc, info));
|
||||||
.setPid(
|
|
||||||
prepareListStructProps(
|
|
||||||
doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
|
|
||||||
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
|
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
|
||||||
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
|
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
|
||||||
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||||
|
@ -278,6 +293,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
r.setBestaccessright(getBestAccessRights(instances));
|
r.setBestaccessright(getBestAccessRights(instances));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected abstract List<StructuredProperty> prepareResultPids(Document doc, DataInfo info);
|
||||||
|
|
||||||
private List<Context> prepareContexts(final Document doc, final DataInfo info) {
|
private List<Context> prepareContexts(final Document doc, final DataInfo info) {
|
||||||
final List<Context> list = new ArrayList<>();
|
final List<Context> list = new ArrayList<>();
|
||||||
for (final Object o : doc.selectNodes("//oaf:concept")) {
|
for (final Object o : doc.selectNodes("//oaf:concept")) {
|
||||||
|
@ -358,7 +375,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
||||||
|
|
||||||
protected static Qualifier getBestAccessRights(List<Instance> instanceList) {
|
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
|
||||||
if (instanceList != null) {
|
if (instanceList != null) {
|
||||||
final Optional<Qualifier> min = instanceList
|
final Optional<Qualifier> min = instanceList
|
||||||
.stream()
|
.stream()
|
||||||
|
@ -405,14 +422,12 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Qualifier prepareQualifier(
|
protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) {
|
||||||
final Node node,
|
return prepareQualifier(node.valueOf(xpath).trim(), schemeId);
|
||||||
final String xpath,
|
}
|
||||||
final String schemeId,
|
|
||||||
final String schemeName) {
|
protected Qualifier prepareQualifier(final String classId, final String schemeId) {
|
||||||
final String classId = node.valueOf(xpath);
|
return vocs.getTermAsQualifier(schemeId, classId);
|
||||||
final String className = code2name.get(classId);
|
|
||||||
return qualifier(classId, className, schemeId, schemeName);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected List<StructuredProperty> prepareListStructProps(
|
protected List<StructuredProperty> prepareListStructProps(
|
||||||
|
@ -420,14 +435,31 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
final String xpath,
|
final String xpath,
|
||||||
final String xpathClassId,
|
final String xpathClassId,
|
||||||
final String schemeId,
|
final String schemeId,
|
||||||
final String schemeName,
|
|
||||||
final DataInfo info) {
|
final DataInfo info) {
|
||||||
final List<StructuredProperty> res = new ArrayList<>();
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
|
|
||||||
for (final Object o : node.selectNodes(xpath)) {
|
for (final Object o : node.selectNodes(xpath)) {
|
||||||
final Node n = (Node) o;
|
final Node n = (Node) o;
|
||||||
final String classId = n.valueOf(xpathClassId);
|
final String classId = n.valueOf(xpathClassId).trim();
|
||||||
final String className = code2name.get(classId);
|
res.add(structuredProperty(n.getText(), prepareQualifier(classId, schemeId), info));
|
||||||
res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<StructuredProperty> prepareListStructPropsWithValidQualifier(
|
||||||
|
final Node node,
|
||||||
|
final String xpath,
|
||||||
|
final String xpathClassId,
|
||||||
|
final String schemeId,
|
||||||
|
final DataInfo info) {
|
||||||
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
|
|
||||||
|
for (final Object o : node.selectNodes(xpath)) {
|
||||||
|
final Node n = (Node) o;
|
||||||
|
final String classId = n.valueOf(xpathClassId).trim();
|
||||||
|
if (vocs.termExists(schemeId, classId)) {
|
||||||
|
res.add(structuredProperty(n.getText(), vocs.getTermAsQualifier(schemeId, classId), info));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,8 +4,10 @@ package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.sql.SQLException;
|
import java.util.Arrays;
|
||||||
import java.util.*;
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -24,10 +26,22 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.DbClient;
|
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class GenerateEntitiesApplication {
|
public class GenerateEntitiesApplication {
|
||||||
|
@ -40,13 +54,12 @@ public class GenerateEntitiesApplication {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
MigrateMongoMdstoresApplication.class
|
GenerateEntitiesApplication.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json")));
|
||||||
"/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json")));
|
|
||||||
|
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
Boolean isSparkSessionManaged = Optional
|
final Boolean isSparkSessionManaged = Optional
|
||||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
.map(Boolean::valueOf)
|
.map(Boolean::valueOf)
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
|
@ -55,29 +68,28 @@ public class GenerateEntitiesApplication {
|
||||||
final String sourcePaths = parser.get("sourcePaths");
|
final String sourcePaths = parser.get("sourcePaths");
|
||||||
final String targetPath = parser.get("targetPath");
|
final String targetPath = parser.get("targetPath");
|
||||||
|
|
||||||
final String dbUrl = parser.get("postgresUrl");
|
// final String dbUrl = parser.get("postgresUrl");
|
||||||
final String dbUser = parser.get("postgresUser");
|
// final String dbUser = parser.get("postgresUser");
|
||||||
final String dbPassword = parser.get("postgresPassword");
|
// final String dbPassword = parser.get("postgresPassword");
|
||||||
|
|
||||||
final Map<String, String> code2name = loadClassNames(dbUrl, dbUser, dbPassword);
|
final String isLookupUrl = parser.get("isLookupUrl");
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
final VocabularyGroup vocs = loadVocsFromIS(isLookupUrl); // MAP: vocId -> voc
|
||||||
runWithSparkSession(
|
|
||||||
conf,
|
final SparkConf conf = new SparkConf();
|
||||||
isSparkSessionManaged,
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||||
spark -> {
|
removeOutputDir(spark, targetPath);
|
||||||
removeOutputDir(spark, targetPath);
|
generateEntities(spark, vocs, sourcePaths, targetPath);
|
||||||
generateEntities(spark, code2name, sourcePaths, targetPath);
|
});
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void generateEntities(
|
private static void generateEntities(
|
||||||
final SparkSession spark,
|
final SparkSession spark,
|
||||||
final Map<String, String> code2name,
|
final VocabularyGroup vocs,
|
||||||
final String sourcePaths,
|
final String sourcePaths,
|
||||||
final String targetPath) {
|
final String targetPath) {
|
||||||
|
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
final List<String> existingSourcePaths = Arrays
|
final List<String> existingSourcePaths = Arrays
|
||||||
.stream(sourcePaths.split(","))
|
.stream(sourcePaths.split(","))
|
||||||
.filter(p -> exists(sc, p))
|
.filter(p -> exists(sc, p))
|
||||||
|
@ -94,7 +106,7 @@ public class GenerateEntitiesApplication {
|
||||||
sc
|
sc
|
||||||
.sequenceFile(sp, Text.class, Text.class)
|
.sequenceFile(sp, Text.class, Text.class)
|
||||||
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
||||||
.map(k -> convertToListOaf(k._1(), k._2(), code2name))
|
.map(k -> convertToListOaf(k._1(), k._2(), vocs))
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.flatMap(list -> list.iterator()));
|
.flatMap(list -> list.iterator()));
|
||||||
}
|
}
|
||||||
|
@ -110,7 +122,7 @@ public class GenerateEntitiesApplication {
|
||||||
.saveAsTextFile(targetPath, GzipCodec.class);
|
.saveAsTextFile(targetPath, GzipCodec.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Oaf merge(Oaf o1, Oaf o2) {
|
private static Oaf merge(final Oaf o1, final Oaf o2) {
|
||||||
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
|
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
|
||||||
((OafEntity) o1).mergeFrom((OafEntity) o2);
|
((OafEntity) o1).mergeFrom((OafEntity) o2);
|
||||||
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
|
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
|
||||||
|
@ -122,14 +134,16 @@ public class GenerateEntitiesApplication {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static List<Oaf> convertToListOaf(
|
private static List<Oaf> convertToListOaf(
|
||||||
final String id, final String s, final Map<String, String> code2name) {
|
final String id,
|
||||||
|
final String s,
|
||||||
|
final VocabularyGroup vocs) {
|
||||||
final String type = StringUtils.substringAfter(id, ":");
|
final String type = StringUtils.substringAfter(id, ":");
|
||||||
|
|
||||||
switch (type.toLowerCase()) {
|
switch (type.toLowerCase()) {
|
||||||
case "native_oaf":
|
case "native_oaf":
|
||||||
return new OafToOafMapper(code2name).processMdRecord(s);
|
return new OafToOafMapper(vocs).processMdRecord(s);
|
||||||
case "native_odf":
|
case "native_odf":
|
||||||
return new OdfToOafMapper(code2name).processMdRecord(s);
|
return new OdfToOafMapper(vocs).processMdRecord(s);
|
||||||
case "datasource":
|
case "datasource":
|
||||||
return Arrays.asList(convertFromJson(s, Datasource.class));
|
return Arrays.asList(convertFromJson(s, Datasource.class));
|
||||||
case "organization":
|
case "organization":
|
||||||
|
@ -151,29 +165,33 @@ public class GenerateEntitiesApplication {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Map<String, String> loadClassNames(
|
private static VocabularyGroup loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException {
|
||||||
final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
|
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||||
|
|
||||||
log.info("Loading vocabulary terms from db...");
|
final String xquery = IOUtils
|
||||||
|
.toString(
|
||||||
|
GenerateEntitiesApplication.class
|
||||||
|
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery"));
|
||||||
|
|
||||||
final Map<String, String> map = new HashMap<>();
|
final VocabularyGroup vocs = new VocabularyGroup();
|
||||||
|
|
||||||
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
for (final String s : isLookUpService.quickSearchProfile(xquery)) {
|
||||||
dbClient
|
final String[] arr = s.split("@=@");
|
||||||
.processResults(
|
if (arr.length == 4) {
|
||||||
"select code, name from class",
|
final String vocId = arr[0].trim();
|
||||||
rs -> {
|
final String vocName = arr[1].trim();
|
||||||
try {
|
final String termId = arr[2].trim();
|
||||||
map.put(rs.getString("code"), rs.getString("name"));
|
final String termName = arr[3].trim();
|
||||||
} catch (final SQLException e) {
|
|
||||||
e.printStackTrace();
|
if (!vocs.vocabularyExists(vocId)) {
|
||||||
}
|
vocs.addVocabulary(vocId, vocName);
|
||||||
});
|
}
|
||||||
|
|
||||||
|
vocs.addTerm(vocId, termId, termName);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Found " + map.size() + " terms.");
|
return vocs;
|
||||||
|
|
||||||
return map;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
|
private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
|
||||||
|
@ -196,7 +214,7 @@ public class GenerateEntitiesApplication {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void removeOutputDir(SparkSession spark, String path) {
|
private static void removeOutputDir(final SparkSession spark, final String path) {
|
||||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
|
||||||
|
@ -13,7 +14,6 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
@ -23,7 +23,8 @@ import org.dom4j.Node;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
@ -36,8 +37,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
public OafToOafMapper(final Map<String, String> code2name) {
|
public OafToOafMapper(final VocabularyGroup vocs) {
|
||||||
super(code2name);
|
super(vocs);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -83,7 +84,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Qualifier prepareLanguages(final Document doc) {
|
protected Qualifier prepareLanguages(final Document doc) {
|
||||||
return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES, DNET_LANGUAGES);
|
return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -130,14 +131,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
final Instance instance = new Instance();
|
final Instance instance = new Instance();
|
||||||
instance
|
instance
|
||||||
.setInstancetype(
|
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
|
||||||
prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
|
|
||||||
instance.setCollectedfrom(collectedfrom);
|
instance.setCollectedfrom(collectedfrom);
|
||||||
instance.setHostedby(hostedby);
|
instance.setHostedby(hostedby);
|
||||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||||
instance
|
instance
|
||||||
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES));
|
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
|
||||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||||
instance
|
instance
|
||||||
|
@ -297,4 +297,10 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
||||||
return null; // NOT PRESENT IN OAF
|
return null; // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
|
||||||
|
return prepareListStructPropsWithValidQualifier(
|
||||||
|
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,6 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
||||||
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
|
||||||
|
@ -21,14 +22,14 @@ import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
@ -43,8 +44,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
|
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
|
||||||
|
|
||||||
public OdfToOafMapper(final Map<String, String> code2name) {
|
public OdfToOafMapper(final VocabularyGroup vocs) {
|
||||||
super(code2name);
|
super(vocs);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -120,14 +121,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
final Instance instance = new Instance();
|
final Instance instance = new Instance();
|
||||||
instance
|
instance
|
||||||
.setInstancetype(
|
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
|
||||||
prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
|
|
||||||
instance.setCollectedfrom(collectedfrom);
|
instance.setCollectedfrom(collectedfrom);
|
||||||
instance.setHostedby(hostedby);
|
instance.setHostedby(hostedby);
|
||||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||||
instance
|
instance
|
||||||
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES, DNET_ACCESS_MODES));
|
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
|
||||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||||
|
@ -211,7 +211,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Qualifier prepareLanguages(final Document doc) {
|
protected Qualifier prepareLanguages(final Document doc) {
|
||||||
return prepareQualifier(doc, "//datacite:language", DNET_LANGUAGES, DNET_LANGUAGES);
|
return prepareQualifier(doc, "//datacite:language", DNET_LANGUAGES);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -239,7 +239,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||||
return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages");
|
return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -366,7 +366,26 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
@Override
|
@Override
|
||||||
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
||||||
return prepareQualifier(
|
return prepareQualifier(
|
||||||
doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE,
|
doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE);
|
||||||
DNET_DATA_CITE_RESOURCE);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
|
||||||
|
final List<StructuredProperty> res = new ArrayList<>();
|
||||||
|
res
|
||||||
|
.addAll(
|
||||||
|
prepareListStructPropsWithValidQualifier(
|
||||||
|
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info));
|
||||||
|
res
|
||||||
|
.addAll(
|
||||||
|
prepareListStructPropsWithValidQualifier(
|
||||||
|
doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", DNET_PID_TYPES, info));
|
||||||
|
res
|
||||||
|
.addAll(
|
||||||
|
prepareListStructPropsWithValidQualifier(
|
||||||
|
doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']",
|
||||||
|
"@alternateIdentifierType", DNET_PID_TYPES, info));
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class Vocabulary {
|
||||||
|
|
||||||
|
private final String id;
|
||||||
|
private final String name;
|
||||||
|
|
||||||
|
private final Map<String, VocabularyTerm> terms = new HashMap<>();
|
||||||
|
|
||||||
|
public Vocabulary(final String id, final String name) {
|
||||||
|
this.id = id;
|
||||||
|
this.name = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getId() {
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Map<String, VocabularyTerm> getTerms() {
|
||||||
|
return terms;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VocabularyTerm getTerm(final String id) {
|
||||||
|
return terms.get(id.toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void addTerm(final String id, final String name) {
|
||||||
|
terms.put(id.toLowerCase(), new VocabularyTerm(id, name));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean termExists(final String id) {
|
||||||
|
return terms.containsKey(id.toLowerCase());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
|
||||||
|
public class VocabularyGroup {
|
||||||
|
|
||||||
|
private final Map<String, Vocabulary> vocs = new HashMap<>();
|
||||||
|
|
||||||
|
public void addVocabulary(final String id, final String name) {
|
||||||
|
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addTerm(final String vocId, final String id, final String name) {
|
||||||
|
if (vocabularyExists(vocId)) {
|
||||||
|
vocs.get(vocId.toLowerCase()).addTerm(id, name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public VocabularyTerm getTerm(final String vocId, final String id) {
|
||||||
|
if (termExists(vocId, id)) {
|
||||||
|
return vocs.get(vocId.toLowerCase()).getTerm(id);
|
||||||
|
} else {
|
||||||
|
return new VocabularyTerm(id, id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Qualifier getTermAsQualifier(final String vocId, final String id) {
|
||||||
|
if (termExists(vocId, id)) {
|
||||||
|
final Vocabulary v = vocs.get(vocId.toLowerCase());
|
||||||
|
final VocabularyTerm t = v.getTerm(id);
|
||||||
|
return OafMapperUtils.qualifier(t.getId(), t.getName(), v.getId(), v.getName());
|
||||||
|
} else {
|
||||||
|
return OafMapperUtils.qualifier(id, id, vocId, vocId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean termExists(final String vocId, final String id) {
|
||||||
|
return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean vocabularyExists(final String vocId) {
|
||||||
|
return vocs.containsKey(vocId.toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||||
|
|
||||||
|
public class VocabularyTerm {
|
||||||
|
|
||||||
|
private final String id;
|
||||||
|
private final String name;
|
||||||
|
|
||||||
|
public VocabularyTerm(final String id, final String name) {
|
||||||
|
this.id = id;
|
||||||
|
this.name = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getId() {
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -18,22 +18,9 @@
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "pgurl",
|
"paramName": "islookup",
|
||||||
"paramLongName": "postgresUrl",
|
"paramLongName": "islookup",
|
||||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
"paramDescription": "the url of the ISLookupService",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "pguser",
|
|
||||||
"paramLongName": "postgresUser",
|
|
||||||
"paramDescription": "postgres user",
|
|
||||||
"paramRequired": false
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "pgpasswd",
|
|
||||||
"paramLongName": "postgresPassword",
|
|
||||||
"paramDescription": "postgres password",
|
|
||||||
"paramRequired": false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
]
|
]
|
|
@ -34,6 +34,10 @@
|
||||||
<name>mongoDb</name>
|
<name>mongoDb</name>
|
||||||
<description>mongo database</description>
|
<description>mongo database</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>isLookupUrl</name>
|
||||||
|
<description>the address of the lookUp service</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
|
@ -233,9 +237,7 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
|
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
|
||||||
<arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg>
|
<arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg>
|
||||||
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
|
<arg>--islookup</arg><arg>${isLookupUrl}</arg>
|
||||||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
|
||||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="GenerateGraph_claims"/>
|
<ok to="GenerateGraph_claims"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -282,9 +284,7 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePaths</arg><arg>${contentPath}/db_records,${contentPath}/oaf_records,${contentPath}/odf_records</arg>
|
<arg>--sourcePaths</arg><arg>${contentPath}/db_records,${contentPath}/oaf_records,${contentPath}/odf_records</arg>
|
||||||
<arg>--targetPath</arg><arg>${workingDir}/entities</arg>
|
<arg>--targetPath</arg><arg>${workingDir}/entities</arg>
|
||||||
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
|
<arg>--islookup</arg><arg>${isLookupUrl}</arg>
|
||||||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
|
||||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="GenerateGraph"/>
|
<ok to="GenerateGraph"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -9,17 +9,10 @@
|
||||||
<description>the temporary path to store entities before dispatching</description>
|
<description>the temporary path to store entities before dispatching</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>postgresURL</name>
|
<name>isLookupUrl</name>
|
||||||
<description>the postgres URL to access to the database</description>
|
<description>the address of the lookUp service</description>
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>postgresUser</name>
|
|
||||||
<description>the user postgres</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>postgresPassword</name>
|
|
||||||
<description>the password postgres</description>
|
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
<description>memory for driver process</description>
|
<description>memory for driver process</description>
|
||||||
|
@ -62,9 +55,7 @@
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||||
<arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
|
<arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
|
||||||
<arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
|
<arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
|
||||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
<arg>--islookup</arg><arg>${isLookupUrl}</arg>
|
||||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
|
||||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
for $x in collection(' /db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')
|
||||||
|
let $vocid := $x//VOCABULARY_NAME/@code
|
||||||
|
let $vocname := $x//VOCABULARY_NAME/text()
|
||||||
|
for $term in ($x//TERM)
|
||||||
|
return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name)
|
|
@ -9,7 +9,6 @@ import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -20,6 +19,8 @@ import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
import org.mockito.Mock;
|
import org.mockito.Mock;
|
||||||
import org.mockito.junit.jupiter.MockitoExtension;
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
@ -34,18 +35,27 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
public class MappersTest {
|
public class MappersTest {
|
||||||
|
|
||||||
@Mock
|
@Mock
|
||||||
private Map<String, String> code2name;
|
private VocabularyGroup vocs;
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0));
|
when(vocs.getTermAsQualifier(anyString(), anyString()))
|
||||||
|
.thenAnswer(
|
||||||
|
invocation -> OafMapperUtils
|
||||||
|
.qualifier(
|
||||||
|
invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0),
|
||||||
|
invocation.getArgument(0)));
|
||||||
|
|
||||||
|
when(vocs.termExists(anyString(), anyString())).thenReturn(true);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testPublication() throws IOException {
|
void testPublication() throws IOException {
|
||||||
|
|
||||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
|
||||||
|
|
||||||
final List<Oaf> list = new OafToOafMapper(code2name).processMdRecord(xml);
|
final List<Oaf> list = new OafToOafMapper(vocs).processMdRecord(xml);
|
||||||
|
|
||||||
assertEquals(3, list.size());
|
assertEquals(3, list.size());
|
||||||
assertTrue(list.get(0) instanceof Publication);
|
assertTrue(list.get(0) instanceof Publication);
|
||||||
|
@ -86,6 +96,10 @@ public class MappersTest {
|
||||||
assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline()));
|
assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline()));
|
||||||
assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
|
assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
|
||||||
|
|
||||||
|
assertTrue(p.getPid().size() > 0);
|
||||||
|
assertEquals(p.getPid().get(0).getValue(), "10.3897/oneeco.2.e13718");
|
||||||
|
assertEquals(p.getPid().get(0).getQualifier().getClassid(), "doi");
|
||||||
|
|
||||||
assertNotNull(p.getInstance());
|
assertNotNull(p.getInstance());
|
||||||
assertTrue(p.getInstance().size() > 0);
|
assertTrue(p.getInstance().size() > 0);
|
||||||
p
|
p
|
||||||
|
@ -115,6 +129,7 @@ public class MappersTest {
|
||||||
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
|
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
|
||||||
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
|
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
|
||||||
|
|
||||||
|
// System.out.println(new ObjectMapper().writeValueAsString(p));
|
||||||
// System.out.println(new ObjectMapper().writeValueAsString(r1));
|
// System.out.println(new ObjectMapper().writeValueAsString(r1));
|
||||||
// System.out.println(new ObjectMapper().writeValueAsString(r2));
|
// System.out.println(new ObjectMapper().writeValueAsString(r2));
|
||||||
}
|
}
|
||||||
|
@ -123,7 +138,7 @@ public class MappersTest {
|
||||||
void testDataset() throws IOException {
|
void testDataset() throws IOException {
|
||||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
|
||||||
|
|
||||||
final List<Oaf> list = new OdfToOafMapper(code2name).processMdRecord(xml);
|
final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml);
|
||||||
|
|
||||||
assertEquals(3, list.size());
|
assertEquals(3, list.size());
|
||||||
assertTrue(list.get(0) instanceof Dataset);
|
assertTrue(list.get(0) instanceof Dataset);
|
||||||
|
@ -205,7 +220,7 @@ public class MappersTest {
|
||||||
void testSoftware() throws IOException {
|
void testSoftware() throws IOException {
|
||||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
|
||||||
|
|
||||||
final List<Oaf> list = new OdfToOafMapper(code2name).processMdRecord(xml);
|
final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml);
|
||||||
|
|
||||||
assertEquals(1, list.size());
|
assertEquals(1, list.size());
|
||||||
assertTrue(list.get(0) instanceof Software);
|
assertTrue(list.get(0) instanceof Software);
|
||||||
|
|
|
@ -769,7 +769,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
XmlSerializationUtils.asXmlElement("websiteurl", o.getWebsiteurl().getValue()));
|
XmlSerializationUtils.asXmlElement("websiteurl", o.getWebsiteurl().getValue()));
|
||||||
}
|
}
|
||||||
if (o.getLogourl() != null) {
|
if (o.getLogourl() != null) {
|
||||||
metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", o.getLogourl().getValue()));
|
metadata.add(XmlSerializationUtils.asXmlElement("logourl", o.getLogourl().getValue()));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (o.getEclegalbody() != null) {
|
if (o.getEclegalbody() != null) {
|
||||||
|
@ -801,13 +801,13 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.asXmlElement(
|
.asXmlElement(
|
||||||
"echighereducation", o.getEchighereducation().getValue()));
|
"echighereducation", o.getEchighereducation().getValue()));
|
||||||
}
|
}
|
||||||
if (o.getEcinternationalorganization() != null) {
|
if (o.getEcinternationalorganizationeurinterests() != null) {
|
||||||
metadata
|
metadata
|
||||||
.add(
|
.add(
|
||||||
XmlSerializationUtils
|
XmlSerializationUtils
|
||||||
.asXmlElement(
|
.asXmlElement(
|
||||||
"ecinternationalorganizationeurinterests",
|
"ecinternationalorganizationeurinterests",
|
||||||
o.getEcinternationalorganization().getValue()));
|
o.getEcinternationalorganizationeurinterests().getValue()));
|
||||||
}
|
}
|
||||||
if (o.getEcinternationalorganization() != null) {
|
if (o.getEcinternationalorganization() != null) {
|
||||||
metadata
|
metadata
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
<$name$$if(hasId)$ objidentifier="$id$"$else$$endif$>
|
<$name$$if(hasId)$ objidentifier="$id$"$else$$endif$>
|
||||||
$metadata:{$it$}$
|
$metadata:{ it | $it$ }$
|
||||||
</$name$>
|
</$name$>
|
|
@ -0,0 +1,47 @@
|
||||||
|
package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
||||||
|
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
||||||
|
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.dom4j.Document;
|
||||||
|
import org.dom4j.DocumentException;
|
||||||
|
import org.dom4j.io.SAXReader;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
public class XmlRecordFactoryTest {
|
||||||
|
|
||||||
|
private static final String otherDsTypeId = "scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource";
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testXMLRecordFactory() throws IOException, DocumentException {
|
||||||
|
|
||||||
|
String json = IOUtils.toString(getClass().getResourceAsStream("joined_entity.json"));
|
||||||
|
|
||||||
|
assertNotNull(json);
|
||||||
|
JoinedEntity je = new ObjectMapper().readValue(json, JoinedEntity.class);
|
||||||
|
assertNotNull(je);
|
||||||
|
|
||||||
|
ContextMapper contextMapper = new ContextMapper();
|
||||||
|
|
||||||
|
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation, otherDsTypeId);
|
||||||
|
|
||||||
|
String xml = xmlRecordFactory.build(je);
|
||||||
|
|
||||||
|
assertNotNull(xml);
|
||||||
|
|
||||||
|
Document doc = new SAXReader().read(new StringReader(xml));
|
||||||
|
|
||||||
|
assertNotNull(doc);
|
||||||
|
|
||||||
|
System.out.println(doc.asXML());
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue