1
0
Fork 0

merged from master

This commit is contained in:
Claudio Atzori 2020-11-19 14:34:54 +01:00
commit 3f34757c63
52 changed files with 1644 additions and 1528 deletions

View File

@ -15,12 +15,12 @@
<snapshotRepository>
<id>dnet45-snapshots</id>
<name>DNet45 Snapshots</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
<layout>default</layout>
</snapshotRepository>
<repository>
<id>dnet45-releases</id>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
</repository>
</distributionManagement>

View File

@ -104,11 +104,6 @@
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</project>

View File

@ -1,9 +1,7 @@
package eu.dnetlib.dhp.schema.oaf;
package eu.dnetlib.dhp.oa.graph.clean;
import java.util.LinkedHashMap;
import java.util.Objects;
import java.util.Optional;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
@ -12,12 +10,19 @@ import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String NONE = "none";
public static final Set<String> PID_BLACKLIST = new HashSet<>();
static {
PID_BLACKLIST.add("none");
PID_BLACKLIST.add("na");
}
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
@ -71,7 +76,7 @@ public class CleaningFunctions {
return value;
}
public static <T extends Oaf> T fixDefaults(T value) {
protected static <T extends Oaf> T fixDefaults(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
@ -114,7 +119,7 @@ public class CleaningFunctions {
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> NONE.equalsIgnoreCase(sp.getValue()))
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)

View File

@ -3,6 +3,10 @@ package eu.dnetlib.dhp.schema.oaf;
public class ModelHardLimits {
public static final String LAYOUT = "index";
public static final String INTERPRETATION = "openaire";
public static final String SEPARATOR = "-";
public static final int MAX_EXTERNAL_ENTITIES = 50;
public static final int MAX_AUTHORS = 200;
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
@ -11,4 +15,8 @@ public class ModelHardLimits {
public static final int MAX_ABSTRACT_LENGTH = 150000;
public static final int MAX_INSTANCES = 10;
public static String getCollectionName(String format) {
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
}
}

View File

@ -2,7 +2,6 @@
package eu.dnetlib.dhp.schema.oaf;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
@ -13,10 +12,43 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.LicenseComparator;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.utils.DHPUtils;
public class OafMapperUtils {
public static Oaf merge(final Oaf o1, final Oaf o2) {
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
if (ModelSupport.isSubClass(o1, Result.class)) {
return mergeResults((Result) o1, (Result) o2);
} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
((Datasource) o1).mergeFrom((Datasource) o2);
} else if (ModelSupport.isSubClass(o1, Organization.class)) {
((Organization) o1).mergeFrom((Organization) o2);
} else if (ModelSupport.isSubClass(o1, Project.class)) {
((Project) o1).mergeFrom((Project) o2);
} else {
throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
}
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
((Relation) o1).mergeFrom((Relation) o2);
} else {
throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName());
}
return o1;
}
public static Result mergeResults(Result r1, Result r2) {
if (new ResultTypeComparator().compare(r1, r2) < 0) {
r1.mergeFrom(r2);
return r1;
} else {
r2.mergeFrom(r1);
return r2;
}
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);

View File

@ -3,12 +3,10 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;

View File

@ -5,6 +5,7 @@ import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
@ -15,9 +16,15 @@ import org.apache.commons.codec.binary.Hex;
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
import scala.collection.JavaConverters;
import scala.collection.Seq;
public class DHPUtils {
public static Seq<String> toSeq(List<String> list) {
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
}
public static String md5(final String s) {
try {
final MessageDigest md = MessageDigest.getInstance("MD5");

View File

@ -1,4 +1,4 @@
<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="update broker notifications" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>

View File

@ -6,7 +6,9 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
@ -20,7 +22,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -68,12 +71,12 @@ public class CleanGraphSparkJob {
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
fixGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
cleanGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
});
}
private static <T extends Oaf> void fixGraphTable(
private static <T extends Oaf> void cleanGraphTable(
SparkSession spark,
VocabularyGroup vocs,
String inputPath,
@ -99,13 +102,15 @@ public class CleanGraphSparkJob {
return spark
.read()
.textFile(inputEntityPath)
.filter((FilterFunction<String>) s -> isEntityType(s, clazz))
.map((MapFunction<String, String>) s -> StringUtils.substringAfter(s, "|"), Encoders.STRING())
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
Encoders.bean(clazz));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
private static <T extends Oaf> boolean isEntityType(final String s, final Class<T> clazz) {
return StringUtils.substringBefore(s, "|").equals(clazz.getName());
}
}

View File

@ -0,0 +1,206 @@
package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
import java.io.IOException;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
/**
* Groups the graph content by entity identifier to ensure ID uniqueness
*/
public class GroupEntitiesAndRelationsSparkJob {
private static final Logger log = LoggerFactory.getLogger(GroupEntitiesAndRelationsSparkJob.class);
private final static String ID_JPATH = "$.id";
private final static String SOURCE_JPATH = "$.source";
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
GroupEntitiesAndRelationsSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/group_graph_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String graphInputPath = parser.get("graphInputPath");
log.info("graphInputPath: {}", graphInputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
groupEntitiesAndRelations(spark, graphInputPath, outputPath);
});
}
private static void groupEntitiesAndRelations(
SparkSession spark,
String inputPath,
String outputPath) {
TypedColumn<Oaf, Oaf> aggregator = new GroupingAggregator().toColumn();
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
spark
.read()
.textFile(toSeq(listPaths(inputPath, sc)))
.map((MapFunction<String, Oaf>) s -> parseOaf(s), Encoders.kryo(Oaf.class))
.filter((FilterFunction<Oaf>) oaf -> StringUtils.isNotBlank(ModelSupport.idFn().apply(oaf)))
.groupByKey((MapFunction<Oaf, String>) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING())
.agg(aggregator)
.map(
(MapFunction<Tuple2<String, Oaf>, String>) t -> t._2().getClass().getName() +
"|" + OBJECT_MAPPER.writeValueAsString(t._2()),
Encoders.STRING())
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.text(outputPath);
}
public static class GroupingAggregator extends Aggregator<Oaf, Oaf, Oaf> {
@Override
public Oaf zero() {
return null;
}
@Override
public Oaf reduce(Oaf b, Oaf a) {
return mergeAndGet(b, a);
}
private Oaf mergeAndGet(Oaf b, Oaf a) {
if (Objects.nonNull(a) && Objects.nonNull(b)) {
return OafMapperUtils.merge(b, a);
}
return Objects.isNull(a) ? b : a;
}
@Override
public Oaf merge(Oaf b, Oaf a) {
return mergeAndGet(b, a);
}
@Override
public Oaf finish(Oaf j) {
return j;
}
@Override
public Encoder<Oaf> bufferEncoder() {
return Encoders.kryo(Oaf.class);
}
@Override
public Encoder<Oaf> outputEncoder() {
return Encoders.kryo(Oaf.class);
}
}
private static Oaf parseOaf(String s) {
DocumentContext dc = JsonPath
.parse(s, Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS));
final String id = dc.read(ID_JPATH);
if (StringUtils.isNotBlank(id)) {
String prefix = StringUtils.substringBefore(id, "|");
switch (prefix) {
case "10":
return parse(s, Datasource.class);
case "20":
return parse(s, Organization.class);
case "40":
return parse(s, Project.class);
case "50":
String resultType = dc.read("$.resulttype.classid");
switch (resultType) {
case "publication":
return parse(s, Publication.class);
case "dataset":
return parse(s, eu.dnetlib.dhp.schema.oaf.Dataset.class);
case "software":
return parse(s, Software.class);
case "other":
return parse(s, OtherResearchProduct.class);
default:
throw new IllegalArgumentException(String.format("invalid resultType: '%s'", resultType));
}
default:
throw new IllegalArgumentException(String.format("invalid id prefix: '%s'", prefix));
}
} else {
String source = dc.read(SOURCE_JPATH);
if (StringUtils.isNotBlank(source)) {
return parse(s, Relation.class);
} else {
throw new IllegalArgumentException(String.format("invalid oaf: '%s'", s));
}
}
}
private static <T extends Oaf> Oaf parse(String s, Class<T> clazz) {
try {
return OBJECT_MAPPER.readValue(s, clazz);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private static List<String> listPaths(String inputPath, JavaSparkContext sc) {
return HdfsSupport
.listFiles(inputPath, sc.hadoopConfiguration())
.stream()
.collect(Collectors.toList());
}
}

View File

@ -33,9 +33,9 @@ import scala.Tuple2;
* are picked preferring those from the BETA aggregator rather then from PROD. The identity of a relationship is defined
* by eu.dnetlib.dhp.schema.common.ModelSupport#idFn()
*/
public class MergeGraphSparkJob {
public class MergeGraphTableSparkJob {
private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);
private static final Logger log = LoggerFactory.getLogger(MergeGraphTableSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

View File

@ -258,8 +258,8 @@ public abstract class AbstractMdRecordToOafMapper {
r.setCollectedfrom(Arrays.asList(collectedFrom));
r.setPid(prepareResultPids(doc, info));
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection|//dri:dateOfCollection"));
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation|//dri:dateOfTransformation"));
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setOaiprovenance(prepareOAIprovenance(doc));
r.setAuthor(prepareAuthors(doc, info));

View File

@ -4,9 +4,11 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.*;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
@ -18,7 +20,6 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -68,7 +69,7 @@ public class GenerateEntitiesApplication {
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, targetPath);
HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration());
generateEntities(spark, vocs, sourcePaths, targetPath);
});
}
@ -82,7 +83,7 @@ public class GenerateEntitiesApplication {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final List<String> existingSourcePaths = Arrays
.stream(sourcePaths.split(","))
.filter(p -> exists(sc, p))
.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
.collect(Collectors.toList());
log.info("Generate entities from files:");
@ -103,7 +104,7 @@ public class GenerateEntitiesApplication {
inputRdd
.mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
.reduceByKey((o1, o2) -> merge(o1, o2))
.reduceByKey((o1, o2) -> OafMapperUtils.merge(o1, o2))
.map(Tuple2::_2)
.map(
oaf -> oaf.getClass().getSimpleName().toLowerCase()
@ -112,38 +113,6 @@ public class GenerateEntitiesApplication {
.saveAsTextFile(targetPath, GzipCodec.class);
}
private static Oaf merge(final Oaf o1, final Oaf o2) {
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
if (ModelSupport.isSubClass(o1, Result.class)) {
return mergeResults((Result) o1, (Result) o2);
} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
((Datasource) o1).mergeFrom((Datasource) o2);
} else if (ModelSupport.isSubClass(o1, Organization.class)) {
((Organization) o1).mergeFrom((Organization) o2);
} else if (ModelSupport.isSubClass(o1, Project.class)) {
((Project) o1).mergeFrom((Project) o2);
} else {
throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
}
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
((Relation) o1).mergeFrom((Relation) o2);
} else {
throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName());
}
return o1;
}
protected static Result mergeResults(Result r1, Result r2) {
if (new ResultTypeComparator().compare(r1, r2) < 0) {
r1.mergeFrom(r2);
return r1;
} else {
r2.mergeFrom(r1);
return r2;
}
}
private static List<Oaf> convertToListOaf(
final String id,
final String s,
@ -192,17 +161,4 @@ public class GenerateEntitiesApplication {
}
}
private static boolean exists(final JavaSparkContext context, final String pathToFile) {
try {
final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration());
final Path path = new Path(pathToFile);
return hdfs.exists(path);
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
private static void removeOutputDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
@ -9,25 +10,20 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
import java.io.Closeable;
import java.io.IOException;
@ -442,26 +438,22 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");
try {
final String targetType = rs.getString(TARGET_TYPE);
if (rs.getString(SOURCE_TYPE).equals("context")) {
final Result r;
switch (targetType) {
case "dataset":
r = new Dataset();
break;
case "software":
r = new Software();
break;
case "other":
r = new OtherResearchProduct();
break;
case "publication":
default:
r = new Publication();
break;
if (rs.getString(TARGET_TYPE).equals("dataset")) {
r = new Dataset();
r.setResulttype(DATASET_DEFAULT_RESULTTYPE);
} else if (rs.getString(TARGET_TYPE).equals("software")) {
r = new Software();
r.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
} else if (rs.getString(TARGET_TYPE).equals("other")) {
r = new OtherResearchProduct();
r.setResulttype(ORP_DEFAULT_RESULTTYPE);
} else {
r = new Publication();
r.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
}
r.setId(createOpenaireId(50, rs.getString("target_id"), false));
r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setContext(prepareContext(rs.getString("source_id"), info));
@ -471,7 +463,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return Arrays.asList(r);
} else {
final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false);
final String targetId = createOpenaireId(targetType, rs.getString("target_id"), false);
final String targetId = createOpenaireId(rs.getString(TARGET_TYPE), rs.getString("target_id"), false);
final Relation r1 = new Relation();
final Relation r2 = new Relation();
@ -527,9 +519,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
final String inferenceprovenance = rs.getString("inferenceprovenance");
final Boolean inferred = rs.getBoolean("inferred");
final String trust = rs.getString("trust");
final double trust = rs.getDouble("trust");
return dataInfo(
deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION,
String.format("%.3f", trust));
}
private Qualifier prepareQualifierSplitting(final String s) {

View File

@ -2,9 +2,7 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
import java.util.ArrayList;
import java.util.List;
@ -18,9 +16,9 @@ import org.dom4j.Node;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {

View File

@ -2,15 +2,9 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
@ -18,9 +12,9 @@ import org.dom4j.Document;
import org.dom4j.Node;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {

View File

@ -50,12 +50,36 @@
</property>
</parameters>
<start to="fork_clean_graph"/>
<start to="group_entities"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="group_entities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>group graph entities and relations</name>
<class>eu.dnetlib.dhp.oa.graph.clean.GroupEntitiesAndRelationsSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--graphInputPath</arg><arg>${graphInputPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/grouped_entities</arg>
</spark>
<ok to="fork_clean_graph"/>
<error to="Kill"/>
</action>
<fork name="fork_clean_graph">
<path start="clean_publication"/>
<path start="clean_dataset"/>
@ -84,7 +108,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
<arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -110,7 +134,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
<arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -136,7 +160,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
<arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -162,7 +186,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
<arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -188,7 +212,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
<arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -214,7 +238,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
<arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -240,7 +264,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
<arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
@ -266,7 +290,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
<arg>--inputPath</arg><arg>${workingDir}/grouped_entities</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>

View File

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "gin",
"paramLongName": "graphInputPath",
"paramDescription": "the graph root path",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the output merged graph root path",
"paramRequired": true
}
]

View File

@ -2,11 +2,11 @@
<parameters>
<property>
<name>betaInputGgraphPath</name>
<name>betaInputGraphPath</name>
<description>the beta graph root path</description>
</property>
<property>
<name>prodInputGgraphPath</name>
<name>prodInputGraphPath</name>
<description>the production graph root path</description>
</property>
<property>
@ -76,7 +76,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Merge publications</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -88,8 +88,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/publication</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/publication</arg>
<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/publication</arg>
<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/publication</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--priority</arg><arg>${priority}</arg>
@ -103,7 +103,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Merge datasets</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -115,8 +115,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/dataset</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/dataset</arg>
<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/dataset</arg>
<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/dataset</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--priority</arg><arg>${priority}</arg>
@ -130,7 +130,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Merge otherresearchproducts</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -142,8 +142,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/otherresearchproduct</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/otherresearchproduct</arg>
<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/otherresearchproduct</arg>
<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--priority</arg><arg>${priority}</arg>
@ -157,7 +157,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Merge softwares</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -169,8 +169,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/software</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/software</arg>
<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/software</arg>
<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/software</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--priority</arg><arg>${priority}</arg>
@ -184,7 +184,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Merge datasources</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -196,8 +196,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/datasource</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/datasource</arg>
<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/datasource</arg>
<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/datasource</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--priority</arg><arg>${priority}</arg>
@ -211,7 +211,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Merge organizations</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -223,8 +223,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/organization</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/organization</arg>
<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/organization</arg>
<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/organization</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--priority</arg><arg>${priority}</arg>
@ -238,7 +238,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Merge projects</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -250,8 +250,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/project</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/project</arg>
<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/project</arg>
<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/project</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--priority</arg><arg>${priority}</arg>
@ -265,7 +265,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Merge relations</name>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
<class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -277,8 +277,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/relation</arg>
<arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/relation</arg>
<arg>--betaInputPath</arg><arg>${betaInputGraphPath}/relation</arg>
<arg>--prodInputPath</arg><arg>${prodInputGraphPath}/relation</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
<arg>--priority</arg><arg>${priority}</arg>

View File

@ -19,7 +19,10 @@ import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

View File

@ -15,7 +15,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Datasource;
public class MergeGraphSparkJobTest {
public class MergeGraphTableSparkJobTest {
private ObjectMapper mapper;
@ -28,7 +28,7 @@ public class MergeGraphSparkJobTest {
public void testMergeDatasources() throws IOException {
assertEquals(
"openaire-cris_1.1",
MergeGraphSparkJob
MergeGraphTableSparkJob
.mergeDatasource(
d("datasource_cris.json"),
d("datasource_UNKNOWN.json"))
@ -36,7 +36,7 @@ public class MergeGraphSparkJobTest {
.getClassid());
assertEquals(
"openaire-cris_1.1",
MergeGraphSparkJob
MergeGraphTableSparkJob
.mergeDatasource(
d("datasource_UNKNOWN.json"),
d("datasource_cris.json"))
@ -44,7 +44,7 @@ public class MergeGraphSparkJobTest {
.getClassid());
assertEquals(
"driver-openaire2.0",
MergeGraphSparkJob
MergeGraphTableSparkJob
.mergeDatasource(
d("datasource_native.json"),
d("datasource_driver-openaire2.0.json"))
@ -52,7 +52,7 @@ public class MergeGraphSparkJobTest {
.getClassid());
assertEquals(
"driver-openaire2.0",
MergeGraphSparkJob
MergeGraphTableSparkJob
.mergeDatasource(
d("datasource_driver-openaire2.0.json"),
d("datasource_native.json"))
@ -60,7 +60,7 @@ public class MergeGraphSparkJobTest {
.getClassid());
assertEquals(
"openaire4.0",
MergeGraphSparkJob
MergeGraphTableSparkJob
.mergeDatasource(
d("datasource_notCompatible.json"),
d("datasource_openaire4.0.json"))
@ -68,7 +68,7 @@ public class MergeGraphSparkJobTest {
.getClassid());
assertEquals(
"notCompatible",
MergeGraphSparkJob
MergeGraphTableSparkJob
.mergeDatasource(
d("datasource_notCompatible.json"),
d("datasource_UNKNOWN.json"))

View File

@ -70,7 +70,7 @@ public class GenerateEntitiesApplicationTest {
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
String resultType) {
final Result merge = GenerateEntitiesApplication.mergeResults(publication, dataset);
final Result merge = OafMapperUtils.mergeResults(publication, dataset);
assertTrue(clazz.isAssignableFrom(merge.getClass()));
assertEquals(resultType, merge.getResulttype().getClassid());
}

View File

@ -72,6 +72,8 @@ public class MappersTest {
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
assertFalse(p.getDataInfo().getInvisible());
assertTrue(p.getSource().size() == 1);
assertTrue(StringUtils.isNotBlank(p.getDateofcollection()));
assertTrue(StringUtils.isNotBlank(p.getDateoftransformation()));
assertTrue(p.getAuthor().size() > 0);
final Optional<Author> author = p
@ -317,7 +319,7 @@ public class MappersTest {
@Test
void testODFRecord() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_record.xml"));
List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
System.out.println("***************");
@ -328,6 +330,22 @@ public class MappersTest {
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
}
@Test
void testTextGrid() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("textgrid.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
System.out.println("***************");
final Dataset p = (Dataset) list.get(0);
assertValidId(p.getId());
assertValidId(p.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
System.out.println(p.getTitle().get(0).getValue());
}
private void assertValidId(final String id) {
assertEquals(49, id.length());
assertEquals('|', id.charAt(2));

View File

@ -28,13 +28,7 @@ import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.*;
@ExtendWith(MockitoExtension.class)
public class MigrateDbEntitiesApplicationTest {

View File

@ -31,8 +31,8 @@
},
{
"field": "trust",
"type": "string",
"value": "0.9"
"type": "double",
"value": 0.9
},
{
"field": "inferenceprovenance",

View File

@ -114,8 +114,8 @@
},
{
"field": "trust",
"type": "string",
"value": "0.9"
"type": "double",
"value": 0.9
},
{
"field": "inferenceprovenance",

View File

@ -7,13 +7,12 @@
<header xmlns="http://namespace.openaire.eu/">
<dri:objIdentifier>pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2</dri:objIdentifier>
<dri:recordIdentifier>10.3897/oneeco.2.e13718</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<dr:objectIdentifier/>
<dr:dateOfCollection>2020-03-23T00:20:51.392Z</dr:dateOfCollection>
<dr:dateOfTransformation>2020-03-23T00:26:59.078Z</dr:dateOfTransformation>
<dri:dateOfCollection>2020-03-23T00:20:51.392Z</dri:dateOfCollection>
<dri:dateOfTransformation>2020-03-23T00:26:59.078Z</dri:dateOfTransformation>
<oaf:datasourceprefix>pensoft_____</oaf:datasourceprefix>
</header>
<metadata xmlns="http://namespace.openaire.eu/">

View File

@ -96,8 +96,8 @@
},
{
"field": "trust",
"type": "string",
"value": "0.9"
"type": "double",
"value": 0.9
},
{
"field": "inferenceprovenance",

View File

@ -41,8 +41,8 @@
},
{
"field": "trust",
"type": "string",
"value": "0.9"
"type": "double",
"value": 0.9
},
{
"field": "inferenceprovenance",

View File

@ -86,8 +86,8 @@
},
{
"field": "trust",
"type": "string",
"value": "0.9"
"type": "double",
"value": 0.9
},
{
"field": "inferenceprovenance",

View File

@ -0,0 +1,113 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<oai:header xmlns="http://namespace.openaire.eu/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<dri:objIdentifier>r3f52792889d::000051aa1f61d77d2c0b340091f8024e</dri:objIdentifier>
<dri:recordIdentifier>textgrid:q9cv.0</dri:recordIdentifier>
<dri:dateOfCollection>2020-11-17T09:34:11.128+01:00</dri:dateOfCollection>
<oaf:datasourceprefix>r3f52792889d</oaf:datasourceprefix>
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">textgrid:q9cv.0</identifier>
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2012-01-21T13:35:20Z</datestamp>
<dr:dateOfTransformation>2020-11-17T09:46:21.551+01:00</dr:dateOfTransformation>
</oai:header>
<metadata>
<datacite:resource xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:datacite="http://datacite.org/schema/kernel-3"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<datacite:identifier identifierType="Handle">hdl:11858/00-1734-0000-0003-7664-F</datacite:identifier>
<datacite:creators>
<datacite:creator>
<datacite:creatorName>Hoffmann von Fallersleben, August Heinrich</datacite:creatorName>
<datacite:nameIdentifier nameIdentifierScheme="pnd" schemeURI="https://de.dariah.eu/pnd-service">118552589</datacite:nameIdentifier>
</datacite:creator>
</datacite:creators>
<datacite:titles>
<datacite:title titleType="Other">Mailied</datacite:title>
<datacite:title titleType="Other">August Heinrich Hoffmann von Fallersleben: Unpolitische Lieder von Hoffmann von Fallersleben, 1. + 2. Theil, 1. Theil, Hamburg: Hoffmann und Campe, 1841.</datacite:title>
</datacite:titles>
<datacite:publisher>TextGrid</datacite:publisher>
<datacite:publicationYear>2012</datacite:publicationYear>
<datacite:contributors>
<datacite:contributor contributorType="DataManager">
<datacite:contributorName>tvitt@textgrid.de</datacite:contributorName>
</datacite:contributor>
<datacite:contributor contributorType="Other">
<datacite:contributorName>Digitale Bibliothek</datacite:contributorName>
<datacite:nameIdentifier nameIdentifierScheme="textgrid">TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c</datacite:nameIdentifier>
</datacite:contributor>
</datacite:contributors>
<datacite:dates>
<datacite:date dateType="Created">2012-01-21T13:35:20Z</datacite:date>
<datacite:date dateType="Issued">2012-01-21T13:35:20Z</datacite:date>
<datacite:date dateType="Updated">2012-01-21T13:35:20Z</datacite:date>
</datacite:dates>
<datacite:resourceType resourceTypeGeneral="Dataset"/>
<alternateIdentifiers>
<datacite:alternateIdentifier alternateIdentifierType="URI">textgrid:q9cv.0</datacite:alternateIdentifier>
<alternateIdentifier alternateIdentifierType="URL">http://hdl.handle.net/hdl:11858/00-1734-0000-0003-7664-F</alternateIdentifier>
</alternateIdentifiers>
<datacite:relatedIdentifiers>
<datacite:relatedIdentifier relatedIdentifierType="Handle" relationType="IsPartOf">hdl:11858/00-1734-0000-0003-7666-B</datacite:relatedIdentifier>
</datacite:relatedIdentifiers>
<datacite:sizes>
<datacite:size>527 Bytes</datacite:size>
</datacite:sizes>
<datacite:formats>
<datacite:format>text/tg.edition+tg.aggregation+xml</datacite:format>
</datacite:formats>
<datacite:version>0</datacite:version>
<datacite:rightsList>
<datacite:rights rightsURI="http://creativecommons.org/licenses/by/3.0/de/legalcode"> Der annotierte Datenbestand der Digitalen Bibliothek inklusive
Metadaten sowie davon einzeln zugängliche Teile sind eine Abwandlung
des Datenbestandes von www.editura.de durch TextGrid und werden
unter der Lizenz Creative Commons Namensnennung 3.0 Deutschland
Lizenz (by-Nennung TextGrid) veröffentlicht. Die Lizenz bezieht sich
nicht auf die der Annotation zu Grunde liegenden allgemeinfreien
Texte (Siehe auch Punkt 2 der Lizenzbestimmungen).</datacite:rights>
<datacite:rights rightsURI="info:eu-repo/semantics/openAccess"/>
</datacite:rightsList>
<datacite:descriptions>
<datacite:description descriptionType="Abstract"/>
</datacite:descriptions>
<datacite:geoLocations>
<datacite:geoLocation>
<datacite:geoLocationPlace
xmlns:xs="http://www.w3.org/2001/XMLSchema" xsi:type="xs:string">Hamburg</datacite:geoLocationPlace>
</datacite:geoLocation>
</datacite:geoLocations>
</datacite:resource>
<oaf:identifier identifierType="handle">hdl:11858/00-1734-0000-0003-7664-F</oaf:identifier>
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
<oaf:refereed>0002</oaf:refereed>
<oaf:dateAccepted>2012-01-01</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:license>http://creativecommons.org/licenses/by/3.0/de/legalcode</oaf:license>
<oaf:language>und</oaf:language>
<oaf:hostedBy id="re3data_____::r3d100011365" name="TextGrid Repository"/>
<oaf:collectedFrom id="re3data_____::r3d100011365" name="TextGrid Repository"/>
</metadata>
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2020-11-17T09:34:11.128+01:00">
<baseURL>https%3A%2F%2Fdev.textgridlab.org%2F1.0%2Ftgoaipmh%2Foai</baseURL>
<identifier>textgrid:q9cv.0</identifier>
<datestamp>2012-01-21T13:35:20Z</datestamp>
<metadataNamespace>http://schema.datacite.org/oai/oai-1.0/</metadataNamespace>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk"
classname="sysimport:crosswalk"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -22,6 +22,12 @@
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>dom4j</groupId>
@ -82,9 +88,6 @@
<groupId>org.codehaus.woodstox</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>com.github.ben-manes.caffeine</groupId>
<artifactId>*</artifactId>
@ -109,11 +112,10 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>

View File

@ -0,0 +1,14 @@
package eu.dnetlib.dhp.oa.provision;
public class ProvisionConstants {
public static final String LAYOUT = "index";
public static final String INTERPRETATION = "openaire";
public static final String SEPARATOR = "-";
public static String getCollectionName(String format) {
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
}
}

View File

@ -14,11 +14,12 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
import eu.dnetlib.dhp.oa.provision.utils.ZkServers;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class SolrAdminApplication extends SolrApplication implements Closeable {
public class SolrAdminApplication implements Closeable {
private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);
@ -54,12 +55,12 @@ public class SolrAdminApplication extends SolrApplication implements Closeable {
.orElse(false);
log.info("commit: {}", commit);
final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
final String zkHost = getZkHost(isLookup);
final String zkHost = isLookup.getZkHost();
log.info("zkHost: {}", zkHost);
final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
final String collection = ProvisionConstants.getCollectionName(format);
log.info("collection: {}", collection);
try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {

View File

@ -1,40 +0,0 @@
package eu.dnetlib.dhp.oa.provision;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public abstract class SolrApplication {
private static final Logger log = LoggerFactory.getLogger(SolrApplication.class);
protected static final String LAYOUT = "index";
protected static final String INTERPRETATION = "openaire";
protected static final String SEPARATOR = "-";
protected static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'";
/**
* Method retrieves from the information system the zookeeper quorum of the Solr server
*
* @param isLookup
* @return the zookeeper quorum of the Solr server
* @throws ISLookUpException
*/
protected static String getZkHost(ISLookUpService isLookup) throws ISLookUpException {
return doLookup(
isLookup,
"for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
}
protected static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException {
log.info(String.format("running xquery: %s", xquery));
final String res = isLookup.getResourceProfileByQuery(xquery);
log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ..."));
return res;
}
}

View File

@ -2,12 +2,11 @@
package eu.dnetlib.dhp.oa.provision;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
@ -28,13 +27,11 @@ import com.google.common.collect.Maps;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.*;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
import scala.collection.JavaConverters;
import scala.collection.Seq;
/**
* XmlConverterJob converts the JoinedEntities as XML records
@ -43,8 +40,6 @@ public class XmlConverterJob {
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
public static void main(String[] args) throws Exception {
@ -129,10 +124,6 @@ public class XmlConverterJob {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
private static Seq<String> toSeq(List<String> list) {
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
}
private static Map<String, LongAccumulator> prepareAccumulators(SparkContext sc) {
Map<String, LongAccumulator> accumulators = Maps.newHashMap();
accumulators

View File

@ -20,27 +20,42 @@ import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.lucidworks.spark.util.SolrSupport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class XmlIndexingJob extends SolrApplication {
public class XmlIndexingJob {
private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class);
private static final Integer DEFAULT_BATCH_SIZE = 1000;
protected static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'";
private String inputPath;
private String format;
private int batchSize;
private String outputPath;
private SparkSession spark;
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -60,27 +75,53 @@ public class XmlIndexingJob extends SolrApplication {
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String format = parser.get("format");
log.info("format: {}", format);
final String outputPath = Optional
.ofNullable(parser.get("outputPath"))
.orElse(null);
log.info("outputPath: {}", outputPath);
final Integer batchSize = parser.getObjectMap().containsKey("batchSize")
? Integer.valueOf(parser.get("batchSize"))
: DEFAULT_BATCH_SIZE;
log.info("batchSize: {}", batchSize);
final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
final String fields = getLayoutSource(isLookup, format);
final SparkConf conf = new SparkConf();
conf.registerKryoClasses(new Class[] {
SerializableSolrInputDocument.class
});
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
new XmlIndexingJob(spark, inputPath, format, batchSize, outputPath).run(isLookup);
});
}
public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize, String outputPath) {
this.spark = spark;
this.inputPath = inputPath;
this.format = format;
this.batchSize = batchSize;
this.outputPath = outputPath;
}
public void run(ISLookupClient isLookup) throws ISLookUpException, TransformerException {
final String fields = isLookup.getLayoutSource(format);
log.info("fields: {}", fields);
final String xslt = getLayoutTransformer(isLookup);
final String xslt = isLookup.getLayoutTransformer();
final String dsId = getDsId(format, isLookup);
final String dsId = isLookup.getDsId(format);
log.info("dsId: {}", dsId);
final String zkHost = getZkHost(isLookup);
final String zkHost = isLookup.getZkHost();
log.info("zkHost: {}", zkHost);
final String version = getRecordDatestamp();
@ -88,24 +129,26 @@ public class XmlIndexingJob extends SolrApplication {
final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
log.info("indexRecordTransformer {}", indexRecordXslt);
final SparkConf conf = new SparkConf();
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<SolrInputDocument> docs = sc
.sequenceFile(inputPath, Text.class, Text.class)
.map(t -> t._2().toString())
.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s));
RDD<SolrInputDocument> docs = sc
.sequenceFile(inputPath, Text.class, Text.class)
.map(t -> t._2().toString())
.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s))
.rdd();
final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
SolrSupport.indexDocs(zkHost, collection, batchSize, docs);
});
if (StringUtils.isNotBlank(outputPath)) {
spark
.createDataset(
docs.map(s -> new SerializableSolrInputDocument(s)).rdd(),
Encoders.kryo(SerializableSolrInputDocument.class))
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
} else {
final String collection = ProvisionConstants.getCollectionName(format);
SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
}
}
protected static String toIndexRecord(Transformer tr, final String record) {
@ -151,56 +194,4 @@ public class XmlIndexingJob extends SolrApplication {
return new SimpleDateFormat(DATE_FORMAT).format(new Date());
}
/**
* Method retrieves from the information system the list of fields associated to the given MDFormat name
*
* @param isLookup the ISLookup service stub
* @param format the Metadata format name
* @return the string representation of the list of fields to be indexed
* @throws ISLookUpDocumentNotFoundException
* @throws ISLookUpException
*/
private static String getLayoutSource(final ISLookUpService isLookup, final String format)
throws ISLookUpDocumentNotFoundException, ISLookUpException {
return doLookup(
isLookup,
String
.format(
"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']",
format, LAYOUT));
}
/**
* Method retrieves from the information system the openaireLayoutToRecordStylesheet
*
* @param isLookup the ISLookup service stub
* @return the string representation of the XSLT contained in the transformation rule profile
* @throws ISLookUpDocumentNotFoundException
* @throws ISLookUpException
*/
private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException {
return doLookup(
isLookup,
"collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')"
+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
}
/**
* Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
*
* @param format
* @param isLookup
* @return the IndexDS identifier
* @throws ISLookUpException
*/
private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException {
return doLookup(
isLookup,
String
.format(
"collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')"
+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()",
format));
}
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.dhp.oa.provision.model;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
/**
* Wrapper class needed to make the SolrInputDocument compatible with the Kryo serialization mechanism.
*/
public class SerializableSolrInputDocument extends SolrInputDocument {
public SerializableSolrInputDocument() {
super(new HashMap<>());
}
public SerializableSolrInputDocument(Map<String, SolrInputField> fields) {
super(fields);
}
}

View File

@ -0,0 +1,95 @@
package eu.dnetlib.dhp.oa.provision.utils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.oa.provision.ProvisionConstants;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class ISLookupClient {
private static final Logger log = LoggerFactory.getLogger(ISLookupClient.class);
private ISLookUpService isLookup;
public ISLookupClient(ISLookUpService isLookup) {
this.isLookup = isLookup;
}
/**
* Method retrieves from the information system the list of fields associated to the given MDFormat name
*
* @param format the Metadata format name
* @return the string representation of the list of fields to be indexed
* @throws ISLookUpDocumentNotFoundException
* @throws ISLookUpException
*/
public String getLayoutSource(final String format)
throws ISLookUpDocumentNotFoundException, ISLookUpException {
return doLookup(
String
.format(
"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']",
format, ProvisionConstants.LAYOUT));
}
/**
* Method retrieves from the information system the openaireLayoutToRecordStylesheet
*
* @return the string representation of the XSLT contained in the transformation rule profile
* @throws ISLookUpDocumentNotFoundException
* @throws ISLookUpException
*/
public String getLayoutTransformer() throws ISLookUpException {
return doLookup(
"collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')"
+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
}
/**
* Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
*
* @param format
* @return the IndexDS identifier
* @throws ISLookUpException
*/
public String getDsId(String format) throws ISLookUpException {
return doLookup(
String
.format(
"collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')"
+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()",
format));
}
/**
* Method retrieves from the information system the zookeeper quorum of the Solr server
*
* @return the zookeeper quorum of the Solr server
* @throws ISLookUpException
*/
public String getZkHost() throws ISLookUpException {
return doLookup(
"for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
}
private String doLookup(String xquery) throws ISLookUpException {
log.info(String.format("running xquery: %s", xquery));
final String res = getIsLookup().getResourceProfileByQuery(xquery);
log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ..."));
return res;
}
public ISLookUpService getIsLookup() {
return isLookup;
}
public void setIsLookup(ISLookUpService isLookup) {
this.isLookup = isLookup;
}
}

View File

@ -46,11 +46,6 @@ public class StreamingInputDocumentFactory {
private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";
private static final String outFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
private static final List<String> dateFormats = Arrays
.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
private static final String DEFAULTDNETRESULT = "dnetResult";
private static final String TARGETFIELDS = "targetFields";
@ -125,13 +120,12 @@ public class StreamingInputDocumentFactory {
}
if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
indexDocument.clear();
System.err.println("missing indexrecord id:\n" + inputDocument);
throw new IllegalStateException("cannot extract record ID from: " + inputDocument);
}
return indexDocument;
} catch (XMLStreamException e) {
return new SolrInputDocument();
throw new IllegalStateException(e);
}
}

View File

@ -22,5 +22,11 @@
"paramLongName": "batchSize",
"paramDescription": "size of the batch of documents sent to solr",
"paramRequired": false
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "path on hdfs activating an alternative output for the SolrInputDocuments",
"paramRequired": false
}
]

View File

@ -638,6 +638,7 @@
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--batchSize</arg><arg>${batchSize}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="commit_solr_collection"/>
<error to="Kill"/>

View File

@ -1,107 +1,18 @@
package eu.dnetlib.dhp.oa.provision;
import java.io.File;
import java.nio.file.Path;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.client.solrj.embedded.JettyConfig;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.RequestWriter;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.ConfigSetAdminResponse;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.cloud.MiniSolrCloudCluster;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import junit.framework.Assert;
public class SolrAdminApplicationTest {
private static final Logger log = LoggerFactory.getLogger(SolrAdminApplicationTest.class);
public static final String DEFAULT_COLLECTION = "testCollection";
public static final String CONFIG_NAME = "testConfig";
private static MiniSolrCloudCluster miniCluster;
private static CloudSolrClient cloudSolrClient;
@TempDir
public static Path tempDir;
@BeforeAll
public static void setup() throws Exception {
// random unassigned HTTP port
final int jettyPort = 0;
final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
// create a MiniSolrCloudCluster instance
miniCluster = new MiniSolrCloudCluster(2, tempDir, jettyConfig);
// Upload Solr configuration directory to ZooKeeper
String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig";
File configDir = new File(solrZKConfigDir);
miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
// override settings in the solrconfig include
System.setProperty("solr.tests.maxBufferedDocs", "100000");
System.setProperty("solr.tests.maxIndexingThreads", "-1");
System.setProperty("solr.tests.ramBufferSizeMB", "100");
// use non-test classes so RandomizedRunner isn't necessary
System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
cloudSolrClient = miniCluster.getSolrClient();
cloudSolrClient.setRequestWriter(new RequestWriter());
cloudSolrClient.setParser(new XMLResponseParser());
cloudSolrClient.setDefaultCollection(DEFAULT_COLLECTION);
cloudSolrClient.connect();
log.info(new ConfigSetAdminRequest.List().process(cloudSolrClient).toString());
log.info(CollectionAdminRequest.ClusterStatus.getClusterStatus().process(cloudSolrClient).toString());
createCollection(cloudSolrClient, DEFAULT_COLLECTION, 2, 1, CONFIG_NAME);
}
@AfterAll
public static void shutDown() throws Exception {
miniCluster.shutdown();
}
protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
int replicationFactor, String configName) throws Exception {
ModifiableSolrParams modParams = new ModifiableSolrParams();
modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
modParams.set("name", name);
modParams.set("numShards", numShards);
modParams.set("replicationFactor", replicationFactor);
modParams.set("collection.configName", configName);
QueryRequest request = new QueryRequest(modParams);
request.setPath("/admin/collections");
return client.request(request);
}
public class SolrAdminApplicationTest extends SolrTest {
@Test
public void testPing() throws Exception {
SolrPingResponse pingResponse = cloudSolrClient.ping();
SolrPingResponse pingResponse = miniCluster.getSolrClient().ping();
log.info("pingResponse: '{}'", pingResponse.getStatus());
Assert.assertTrue(pingResponse.getStatus() == 0);
}

View File

@ -0,0 +1,109 @@
package eu.dnetlib.dhp.oa.provision;
import java.io.File;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.embedded.JettyConfig;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.cloud.MiniSolrCloudCluster;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.io.TempDir;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public abstract class SolrTest {
protected static final Logger log = LoggerFactory.getLogger(SolrTest.class);
protected static final String FORMAT = "test";
protected static final String DEFAULT_COLLECTION = FORMAT + "-index-openaire";
protected static final String CONFIG_NAME = "testConfig";
protected static MiniSolrCloudCluster miniCluster;
@TempDir
public static Path workingDir;
@BeforeAll
public static void setup() throws Exception {
// random unassigned HTTP port
final int jettyPort = 0;
final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
log.info(String.format("working directory: %s", workingDir.toString()));
System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
// create a MiniSolrCloudCluster instance
miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
// Upload Solr configuration directory to ZooKeeper
String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig";
File configDir = new File(solrZKConfigDir);
miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
// override settings in the solrconfig include
System.setProperty("solr.tests.maxBufferedDocs", "100000");
System.setProperty("solr.tests.maxIndexingThreads", "-1");
System.setProperty("solr.tests.ramBufferSizeMB", "100");
// use non-test classes so RandomizedRunner isn't necessary
System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
System.setProperty("solr.lock.type", "single");
log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
NamedList<Object> res = createCollection(
miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION);
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
}
@AfterAll
public static void shutDown() throws Exception {
miniCluster.shutdown();
FileUtils.deleteDirectory(workingDir.toFile());
}
protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
ModifiableSolrParams modParams = new ModifiableSolrParams();
modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
modParams.set("name", name);
modParams.set("numShards", numShards);
modParams.set("replicationFactor", replicationFactor);
modParams.set("collection.configName", configName);
modParams.set("maxShardsPerNode", maxShardsPerNode);
QueryRequest request = new QueryRequest(modParams);
request.setPath("/admin/collections");
return client.request(request);
}
}

View File

@ -0,0 +1,147 @@
package eu.dnetlib.dhp.oa.provision;
import java.io.IOException;
import java.io.StringReader;
import java.net.URI;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.CommonParams;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
public class XmlIndexingJobTest extends SolrTest {
protected static SparkSession spark;
private static final Integer batchSize = 100;
@Mock
private ISLookUpService isLookUpService;
@Mock
private ISLookupClient isLookupClient;
@BeforeEach
public void prepareMocks() throws ISLookUpException, IOException {
isLookupClient.setIsLookup(isLookUpService);
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
Mockito
.when(isLookupClient.getDsId(Mockito.anyString()))
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
Mockito
.when(isLookupClient.getLayoutSource(Mockito.anyString()))
.thenReturn(IOUtils.toString(getClass().getResourceAsStream("fields.xml")));
Mockito
.when(isLookupClient.getLayoutTransformer())
.thenReturn(IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")));
}
@BeforeAll
public static void before() {
SparkConf conf = new SparkConf();
conf.setAppName(XmlIndexingJobTest.class.getSimpleName());
conf.registerKryoClasses(new Class[] {
SerializableSolrInputDocument.class
});
conf.setMaster("local[1]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.resolve("spark").toString());
spark = SparkSession
.builder()
.appName(XmlIndexingJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void tearDown() {
spark.stop();
}
@Test
public void testXmlIndexingJob_onSolr() throws Exception {
String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
long nRecord = JavaSparkContext
.fromSparkContext(spark.sparkContext())
.sequenceFile(inputPath, Text.class, Text.class)
.count();
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize, null).run(isLookupClient);
Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
QueryResponse rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "*:*"));
Assertions
.assertEquals(
nRecord, rsp.getResults().getNumFound(),
"the number of indexed records should be equal to the number of input records");
}
@Test
public void testXmlIndexingJob_saveOnHDFS() throws Exception {
final String ID_XPATH = "//header/*[local-name()='objIdentifier']";
String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
final JavaPairRDD<Text, Text> xmlRecords = JavaSparkContext
.fromSparkContext(spark.sparkContext())
.sequenceFile(inputPath, Text.class, Text.class);
long nRecord = xmlRecords.count();
long xmlIdUnique = xmlRecords
.map(t -> t._2().toString())
.map(s -> new SAXReader().read(new StringReader(s)).valueOf(ID_XPATH))
.distinct()
.count();
Assertions.assertEquals(nRecord, xmlIdUnique, "IDs should be unique among input records");
final String outputPath = workingDir.resolve("outputPath").toAbsolutePath().toString();
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize, outputPath).run(isLookupClient);
final Dataset<SerializableSolrInputDocument> solrDocs = spark
.read()
.load(outputPath)
.as(Encoders.kryo(SerializableSolrInputDocument.class));
long docIdUnique = solrDocs.map((MapFunction<SerializableSolrInputDocument, String>) doc -> {
final SolrInputField id = doc.getField("__indexrecordidentifier");
return id.getFirstValue().toString();
}, Encoders.STRING())
.distinct()
.count();
Assertions.assertEquals(xmlIdUnique, docIdUnique, "IDs should be unique among the output records");
}
}

View File

@ -105,7 +105,7 @@
<FIELD indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
<FIELD indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
<FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='result'])"/>
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/resulttype/@classid)"/>
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
<FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
<FIELD indexable="true" name="relfundinglevel0_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0"/>
<FIELD indexable="true" name="relfundinglevel0_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0/@name/string()"/>
@ -123,7 +123,8 @@
<FIELD indexable="true" name="relfundername" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@name)"/>
<FIELD indexable="true" name="relfunderjurisdiction" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@jurisdiction)"/><!-- Collected from of the related entity. Available for result-result relationships -->
<FIELD indexable="true" name="relcollectedfromid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@id)"/>
<FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/><!-- COMMON FIELDS -->
<FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/>
<FIELD indexable="true" name="semrelid" result="false" stat="false" tokenizable="false" value="concat(./to/text(), '||', ./to/@class/string())" xpath="//*[local-name()='entity']//rel"/><!-- COMMON FIELDS -->
<FIELD indexable="true" multivalued="false" name="dateofcollection" result="false" stat="false" type="pdate" value="//header/*[local-name()='dateOfCollection']"/>
<FIELD indexable="true" name="collectedfrom" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./@id, '||', ./@name))" xpath="//*[local-name()='entity']/*/*[local-name()='collectedfrom'] | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']"/>
<FIELD indexable="true" name="collectedfromdatasourceid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@id | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@id)"/>

View File

@ -0,0 +1,31 @@
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- If this file is found in the config directory, it will only be
loaded once at startup. If it is found in Solr's data
directory, it will be re-loaded every commit.
See http://wiki.apache.org/solr/QueryElevationComponent for more info
-->
<elevate>
<!-- Query elevation examples
<query text="foo bar">
<doc id="1" />
<doc id="2" />
<doc id="3" />
</query>
for use with techproducts example
<query text="ipod">
<doc id="MA147LL/A" /> put the actual ipod at the top
<doc id="IW-02" exclude="true" /> exclude this cable
</query>
-->
</elevate>

View File

@ -83,6 +83,7 @@
<lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" />
<!-- an exact 'path' can be used instead of a 'dir' to specify a
specific jar file. This will cause a serious error to be logged
if it can't be loaded.
@ -112,7 +113,8 @@
One can force a particular implementation via solr.MMapDirectoryFactory,
solr.NIOFSDirectoryFactory, or solr.SimpleFSDirectoryFactory.
solr.RAMDirectoryFactory is memory based and not persistent.
solr.RAMDirectoryFactory is memory based, not
persistent, and doesn't work with replication.
-->
<directoryFactory name="DirectoryFactory"
class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/>
@ -204,7 +206,7 @@
More details on the nuances of each LockFactory...
http://wiki.apache.org/lucene-java/AvailableLockFactories
-->
<lockType>${solr.lock.type:single}</lockType>
<lockType>${solr.lock.type:native}</lockType>
<!-- Commit Deletion Policy
Custom deletion policies can be specified here. The class must
@ -331,6 +333,29 @@
postCommit - fired after every commit or optimize command
postOptimize - fired after every optimize command
-->
<!-- The RunExecutableListener executes an external command from a
hook such as postCommit or postOptimize.
exe - the name of the executable to run
dir - dir to use as the current working directory. (default=".")
wait - the calling thread waits until the executable returns.
(default="true")
args - the arguments to pass to the program. (default is none)
env - environment variables to set. (default is none)
-->
<!-- This example shows how RunExecutableListener could be used
with the script based replication...
http://wiki.apache.org/solr/CollectionDistribution
-->
<!--
<listener event="postCommit" class="solr.RunExecutableListener">
<str name="exe">solr/bin/snapshooter</str>
<str name="dir">.</str>
<bool name="wait">true</bool>
<arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
<arr name="env"> <str>MYVAR=val1</str> </arr>
</listener>
-->
</updateHandler>
@ -366,14 +391,22 @@
Query section - these settings control query time things like caches
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
<query>
<!-- Max Boolean Clauses
Maximum number of clauses in each BooleanQuery, an exception
is thrown if exceeded.
** WARNING **
This option actually modifies a global Lucene property that
will affect all SolrCores. If multiple solrconfig.xml files
disagree on this property, the value at any given moment will
be based on the last SolrCore to be initialized.
<!-- Maximum number of clauses in each BooleanQuery, an exception
is thrown if exceeded. It is safe to increase or remove this setting,
since it is purely an arbitrary limit to try and catch user errors where
large boolean queries may not be the best implementation choice.
-->
<maxBooleanClauses>1024</maxBooleanClauses>
<!-- Solr Internal Query Caches
There are two implementations of cache available for Solr,
@ -575,8 +608,21 @@
This section contains instructions for how the SolrDispatchFilter
should behave when processing requests for this SolrCore.
handleSelect is a legacy option that affects the behavior of requests
such as /select?qt=XXX
handleSelect="true" will cause the SolrDispatchFilter to process
the request and dispatch the query to a handler specified by the
"qt" param, assuming "/select" isn't already registered.
handleSelect="false" will cause the SolrDispatchFilter to
ignore "/select" requests, resulting in a 404 unless a handler
is explicitly registered with the name "/select"
handleSelect="true" is not recommended for new users, but is the default
for backwards compatibility
-->
<requestDispatcher>
<requestDispatcher handleSelect="false" >
<!-- Request Parsing
These settings indicate how Solr Requests may be parsed, and
@ -602,14 +648,15 @@
plugins.
*** WARNING ***
Before enabling remote streaming, you should make sure your
system has authentication enabled.
The settings below authorize Solr to fetch remote files, You
should make sure your system has some authentication before
using enableRemoteStreaming="true"
<requestParsers enableRemoteStreaming="false"
multipartUploadLimitInKB="-1"
formdataUploadLimitInKB="-1"
addHttpRequestToContext="false"/>
-->
<requestParsers enableRemoteStreaming="true"
multipartUploadLimitInKB="2048000"
formdataUploadLimitInKB="2048"
addHttpRequestToContext="false"/>
<!-- HTTP Caching
@ -673,6 +720,14 @@
Incoming queries will be dispatched to a specific handler by name
based on the path specified in the request.
Legacy behavior: If the request path uses "/select" but no Request
Handler has that name, and if handleSelect="true" has been specified in
the requestDispatcher, then the Request Handler is dispatched based on
the qt parameter. Handlers without a leading '/' are accessed this way
like so: http://host/app/[core/]select?qt=name If no qt is
given, then the requestHandler that declares default="true" will be
used or the one named "standard".
If a Request Handler is declared with startup="lazy", then it will
not be initialized until the first request that uses it.
@ -692,13 +747,9 @@
-->
<lst name="defaults">
<str name="echoParams">explicit</str>
<str name="q.op">AND</str>
<int name="rows">10</int>
<!-- Default search field
<str name="df">text</str>
-->
<!-- Change from JSON to XML format (the default prior to Solr 7.0)
<str name="wt">xml</str>
-->
<!-- <str name="df">text</str> -->
</lst>
<!-- In addition to defaults, "appends" params can be specified
to identify values which should be appended to the list of
@ -781,10 +832,18 @@
<initParams path="/update/**,/query,/select,/tvrh,/elevate,/spell,/browse">
<lst name="defaults">
<str name="df">_text_</str>
<str name="df">__all</str>
</lst>
</initParams>
<!-- This enabled schemaless mode
<initParams path="/update/**">
<lst name="defaults">
<str name="update.chain">add-unknown-fields-to-the-schema</str>
</lst>
</initParams>
-->
<!-- Solr Cell Update Request Handler
http://wiki.apache.org/solr/ExtractingRequestHandler
@ -796,10 +855,9 @@
<lst name="defaults">
<str name="lowernames">true</str>
<str name="fmap.meta">ignored_</str>
<str name="fmap.content">_text_</str>
<str name="fmap.content">__all</str>
</lst>
</requestHandler>
<!-- Search Components
Search components are registered to SolrCore and used by
@ -861,7 +919,7 @@
<!-- a spellchecker built from a field of the main index -->
<lst name="spellchecker">
<str name="name">default</str>
<str name="field">_text_</str>
<str name="field">__all</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
<!-- the spellcheck distance measure used, the default is the internal levenshtein -->
<str name="distanceMeasure">internal</str>
@ -986,6 +1044,7 @@
<searchComponent name="elevator" class="solr.QueryElevationComponent" >
<!-- pick a fieldType to analyze queries -->
<str name="queryFieldType">string</str>
<str name="config-file">elevate.xml</str>
</searchComponent>
<!-- A request handler for demonstrating the elevator component -->
@ -1116,81 +1175,70 @@
<!-- Add unknown fields to the schema
Field type guessing update processors that will
An example field type guessing update processor that will
attempt to parse string-typed field values as Booleans, Longs,
Doubles, or Dates, and then add schema fields with the guessed
field types. Text content will be indexed as "text_general" as
well as a copy to a plain string version in *_str.
field types.
These require that the schema is both managed and mutable, by
This requires that the schema is both managed and mutable, by
declaring schemaFactory as ManagedIndexSchemaFactory, with
mutable specified as true.
See http://wiki.apache.org/solr/GuessingFieldTypes
-->
<updateProcessor class="solr.UUIDUpdateProcessorFactory" name="uuid"/>
<updateProcessor class="solr.RemoveBlankFieldUpdateProcessorFactory" name="remove-blank"/>
<updateProcessor class="solr.FieldNameMutatingUpdateProcessorFactory" name="field-name-mutating">
<str name="pattern">[^\w-\.]</str>
<str name="replacement">_</str>
</updateProcessor>
<updateProcessor class="solr.ParseBooleanFieldUpdateProcessorFactory" name="parse-boolean"/>
<updateProcessor class="solr.ParseLongFieldUpdateProcessorFactory" name="parse-long"/>
<updateProcessor class="solr.ParseDoubleFieldUpdateProcessorFactory" name="parse-double"/>
<updateProcessor class="solr.ParseDateFieldUpdateProcessorFactory" name="parse-date">
<arr name="format">
<str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
<str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
<str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
<str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
<str>yyyy-MM-dd'T'HH:mm:ssZ</str>
<str>yyyy-MM-dd'T'HH:mm:ss</str>
<str>yyyy-MM-dd'T'HH:mmZ</str>
<str>yyyy-MM-dd'T'HH:mm</str>
<str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
<str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
<str>yyyy-MM-dd HH:mm:ss.SSS</str>
<str>yyyy-MM-dd HH:mm:ss,SSS</str>
<str>yyyy-MM-dd HH:mm:ssZ</str>
<str>yyyy-MM-dd HH:mm:ss</str>
<str>yyyy-MM-dd HH:mmZ</str>
<str>yyyy-MM-dd HH:mm</str>
<str>yyyy-MM-dd</str>
</arr>
</updateProcessor>
<updateProcessor class="solr.AddSchemaFieldsUpdateProcessorFactory" name="add-schema-fields">
<lst name="typeMapping">
<str name="valueClass">java.lang.String</str>
<str name="fieldType">text_general</str>
<lst name="copyField">
<str name="dest">*_str</str>
<int name="maxChars">256</int>
<updateRequestProcessorChain name="add-unknown-fields-to-the-schema">
<!-- UUIDUpdateProcessorFactory will generate an id if none is present in the incoming document -->
<processor class="solr.UUIDUpdateProcessorFactory" />
<processor class="solr.RemoveBlankFieldUpdateProcessorFactory"/>
<processor class="solr.FieldNameMutatingUpdateProcessorFactory">
<str name="pattern">[^\w-\.]</str>
<str name="replacement">_</str>
</processor>
<processor class="solr.ParseBooleanFieldUpdateProcessorFactory"/>
<processor class="solr.ParseLongFieldUpdateProcessorFactory"/>
<processor class="solr.ParseDoubleFieldUpdateProcessorFactory"/>
<processor class="solr.ParseDateFieldUpdateProcessorFactory">
<arr name="format">
<str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
<str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
<str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
<str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
<str>yyyy-MM-dd'T'HH:mm:ssZ</str>
<str>yyyy-MM-dd'T'HH:mm:ss</str>
<str>yyyy-MM-dd'T'HH:mmZ</str>
<str>yyyy-MM-dd'T'HH:mm</str>
<str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
<str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
<str>yyyy-MM-dd HH:mm:ss.SSS</str>
<str>yyyy-MM-dd HH:mm:ss,SSS</str>
<str>yyyy-MM-dd HH:mm:ssZ</str>
<str>yyyy-MM-dd HH:mm:ss</str>
<str>yyyy-MM-dd HH:mmZ</str>
<str>yyyy-MM-dd HH:mm</str>
<str>yyyy-MM-dd</str>
</arr>
</processor>
<processor class="solr.AddSchemaFieldsUpdateProcessorFactory">
<str name="defaultFieldType">strings</str>
<lst name="typeMapping">
<str name="valueClass">java.lang.Boolean</str>
<str name="fieldType">booleans</str>
</lst>
<!-- Use as default mapping instead of defaultFieldType -->
<bool name="default">true</bool>
</lst>
<lst name="typeMapping">
<str name="valueClass">java.lang.Boolean</str>
<str name="fieldType">booleans</str>
</lst>
<lst name="typeMapping">
<str name="valueClass">java.util.Date</str>
<str name="fieldType">pdates</str>
</lst>
<lst name="typeMapping">
<str name="valueClass">java.lang.Long</str>
<str name="valueClass">java.lang.Integer</str>
<str name="fieldType">plongs</str>
</lst>
<lst name="typeMapping">
<str name="valueClass">java.lang.Number</str>
<str name="fieldType">pdoubles</str>
</lst>
</updateProcessor>
<lst name="typeMapping">
<str name="valueClass">java.util.Date</str>
<str name="fieldType">tdates</str>
</lst>
<lst name="typeMapping">
<str name="valueClass">java.lang.Long</str>
<str name="valueClass">java.lang.Integer</str>
<str name="fieldType">tlongs</str>
</lst>
<lst name="typeMapping">
<str name="valueClass">java.lang.Number</str>
<str name="fieldType">tdoubles</str>
</lst>
</processor>
<!-- The update.autoCreateFields property can be turned to false to disable schemaless mode -->
<updateRequestProcessorChain name="add-unknown-fields-to-the-schema" default="${update.autoCreateFields:true}"
processor="uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date,add-schema-fields">
<processor class="solr.LogUpdateProcessorFactory"/>
<processor class="solr.DistributedUpdateProcessorFactory"/>
<processor class="solr.RunUpdateProcessorFactory"/>
@ -1313,7 +1361,7 @@
<!-- Query Parsers
https://lucene.apache.org/solr/guide/query-syntax-and-parsing.html
https://cwiki.apache.org/confluence/display/solr/Query+Syntax+and+Parsing
Multiple QParserPlugins can be registered by name, and then
used in either the "defType" param for the QueryComponent (used

View File

@ -46,7 +46,7 @@
</configuration>
</global>
<start to="Step18"/>
<start to="Step1"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>

26
pom.xml
View File

@ -50,7 +50,7 @@
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 releases</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
<snapshots>
<enabled>false</enabled>
@ -70,6 +70,26 @@
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>dnet45-releases-old</id>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<releases>
<enabled>false</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>dnet45-snapshots-old</id>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
<releases>
<enabled>false</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
<dependencies>
@ -639,12 +659,12 @@
<snapshotRepository>
<id>dnet45-snapshots</id>
<name>DNet45 Snapshots</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
<layout>default</layout>
</snapshotRepository>
<repository>
<id>dnet45-releases</id>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
</repository>
</distributionManagement>
<reporting>