forked from D-Net/dnet-hadoop
Merge branch 'deduptesting'
This commit is contained in:
commit
8d2102d7d2
|
@ -0,0 +1,57 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
import org.apache.spark.sql.Encoder;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.expressions.Aggregator;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
|
public class RelationAggregator extends Aggregator<Relation, Relation, Relation> {
|
||||||
|
|
||||||
|
private static Relation ZERO = new Relation();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Relation zero() {
|
||||||
|
return ZERO;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Relation reduce(Relation b, Relation a) {
|
||||||
|
return mergeRel(b, a);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Relation merge(Relation b, Relation a) {
|
||||||
|
return mergeRel(b, a);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Relation finish(Relation r) {
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Relation mergeRel(Relation b, Relation a) {
|
||||||
|
if (Objects.equals(b, ZERO)) {
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
if (Objects.equals(a, ZERO)) {
|
||||||
|
return b;
|
||||||
|
}
|
||||||
|
|
||||||
|
b.mergeFrom(a);
|
||||||
|
return b;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Encoder<Relation> bufferEncoder() {
|
||||||
|
return Encoders.kryo(Relation.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Encoder<Relation> outputEncoder() {
|
||||||
|
return Encoders.kryo(Relation.class);
|
||||||
|
}
|
||||||
|
}
|
|
@ -5,11 +5,13 @@ import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
import org.apache.spark.graphx.Edge;
|
import org.apache.spark.graphx.Edge;
|
||||||
import org.apache.spark.rdd.RDD;
|
import org.apache.spark.rdd.RDD;
|
||||||
|
@ -75,7 +77,11 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingPath = parser.get("workingPath");
|
||||||
final String isLookUpUrl = parser.get("isLookUpUrl");
|
final String isLookUpUrl = parser.get("isLookUpUrl");
|
||||||
final String actionSetId = parser.get("actionSetId");
|
final String actionSetId = parser.get("actionSetId");
|
||||||
|
int cut = Optional
|
||||||
|
.ofNullable(parser.get("cutConnectedComponent"))
|
||||||
|
.map(Integer::valueOf)
|
||||||
|
.orElse(0);
|
||||||
|
log.info("connected component cut: '{}'", cut);
|
||||||
log.info("graphBasePath: '{}'", graphBasePath);
|
log.info("graphBasePath: '{}'", graphBasePath);
|
||||||
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
||||||
log.info("actionSetId: '{}'", actionSetId);
|
log.info("actionSetId: '{}'", actionSetId);
|
||||||
|
@ -100,8 +106,10 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
|
|
||||||
final RDD<Edge<String>> edgeRdd = spark
|
final RDD<Edge<String>> edgeRdd = spark
|
||||||
.read()
|
.read()
|
||||||
.load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
|
.textFile(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
|
||||||
.as(Encoders.bean(Relation.class))
|
.map(
|
||||||
|
(MapFunction<String, Relation>) r -> OBJECT_MAPPER.readValue(r, Relation.class),
|
||||||
|
Encoders.bean(Relation.class))
|
||||||
.javaRDD()
|
.javaRDD()
|
||||||
.map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass()))
|
.map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass()))
|
||||||
.rdd();
|
.rdd();
|
||||||
|
@ -109,7 +117,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
final Dataset<Relation> mergeRels = spark
|
final Dataset<Relation> mergeRels = spark
|
||||||
.createDataset(
|
.createDataset(
|
||||||
GraphProcessor
|
GraphProcessor
|
||||||
.findCCs(vertexes.rdd(), edgeRdd, maxIterations)
|
.findCCs(vertexes.rdd(), edgeRdd, maxIterations, cut)
|
||||||
.toJavaRDD()
|
.toJavaRDD()
|
||||||
.filter(k -> k.getDocIds().size() > 1)
|
.filter(k -> k.getDocIds().size() > 1)
|
||||||
.flatMap(cc -> ccToMergeRel(cc, dedupConf))
|
.flatMap(cc -> ccToMergeRel(cc, dedupConf))
|
||||||
|
@ -117,6 +125,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
|
|
||||||
mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath);
|
mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -34,6 +35,8 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class);
|
private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class);
|
||||||
|
|
||||||
|
public static final int NUM_PARTITIONS = 1000;
|
||||||
|
|
||||||
public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) {
|
public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) {
|
||||||
super(parser, spark);
|
super(parser, spark);
|
||||||
}
|
}
|
||||||
|
@ -48,13 +51,6 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
|
||||||
conf
|
|
||||||
.registerKryoClasses(
|
|
||||||
new Class[] {
|
|
||||||
MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class
|
|
||||||
});
|
|
||||||
|
|
||||||
new SparkCreateSimRels(parser, getSparkSession(conf))
|
new SparkCreateSimRels(parser, getSparkSession(conf))
|
||||||
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
||||||
}
|
}
|
||||||
|
@ -68,7 +64,12 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
final String isLookUpUrl = parser.get("isLookUpUrl");
|
final String isLookUpUrl = parser.get("isLookUpUrl");
|
||||||
final String actionSetId = parser.get("actionSetId");
|
final String actionSetId = parser.get("actionSetId");
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingPath = parser.get("workingPath");
|
||||||
|
final int numPartitions = Optional
|
||||||
|
.ofNullable(parser.get("numPartitions"))
|
||||||
|
.map(Integer::valueOf)
|
||||||
|
.orElse(NUM_PARTITIONS);
|
||||||
|
|
||||||
|
log.info("numPartitions: '{}'", numPartitions);
|
||||||
log.info("graphBasePath: '{}'", graphBasePath);
|
log.info("graphBasePath: '{}'", graphBasePath);
|
||||||
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
||||||
log.info("actionSetId: '{}'", actionSetId);
|
log.info("actionSetId: '{}'", actionSetId);
|
||||||
|
@ -88,6 +89,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
|
|
||||||
JavaPairRDD<String, MapDocument> mapDocuments = sc
|
JavaPairRDD<String, MapDocument> mapDocuments = sc
|
||||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
||||||
|
.repartition(numPartitions)
|
||||||
.mapToPair(
|
.mapToPair(
|
||||||
(PairFunction<String, String, MapDocument>) s -> {
|
(PairFunction<String, String, MapDocument>) s -> {
|
||||||
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
|
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
|
||||||
|
@ -95,19 +97,17 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
});
|
});
|
||||||
|
|
||||||
// create blocks for deduplication
|
// create blocks for deduplication
|
||||||
JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
|
JavaPairRDD<String, Block> blocks = Deduper
|
||||||
|
.createSortedBlocks(mapDocuments, dedupConf)
|
||||||
|
.repartition(numPartitions);
|
||||||
|
|
||||||
// create relations by comparing only elements in the same group
|
// create relations by comparing only elements in the same group
|
||||||
JavaRDD<Relation> relations = Deduper
|
Deduper
|
||||||
.computeRelations(sc, blocks, dedupConf)
|
.computeRelations(sc, blocks, dedupConf)
|
||||||
.map(t -> createSimRel(t._1(), t._2(), entity));
|
.map(t -> createSimRel(t._1(), t._2(), entity))
|
||||||
|
.repartition(numPartitions)
|
||||||
// save the simrel in the workingdir
|
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
|
||||||
spark
|
.saveAsTextFile(outputPath);
|
||||||
.createDataset(relations.rdd(), Encoders.bean(Relation.class))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Append)
|
|
||||||
.save(outputPath);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,8 +3,11 @@ package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import static org.apache.spark.sql.functions.col;
|
import static org.apache.spark.sql.functions.col;
|
||||||
|
|
||||||
|
import com.google.common.base.Joiner;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -95,7 +98,20 @@ public class SparkPropagateRelation extends AbstractSparkAction {
|
||||||
FieldType.TARGET,
|
FieldType.TARGET,
|
||||||
getDeletedFn());
|
getDeletedFn());
|
||||||
|
|
||||||
save(newRels.union(updated).union(mergeRels), outputRelationPath, SaveMode.Overwrite);
|
save(
|
||||||
|
distinctRelations(newRels
|
||||||
|
.union(updated)
|
||||||
|
.union(mergeRels)
|
||||||
|
.map((MapFunction<Relation, Relation>) r -> r, Encoders.kryo(Relation.class))),
|
||||||
|
outputRelationPath, SaveMode.Overwrite);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Dataset<Relation> distinctRelations(Dataset<Relation> rels) {
|
||||||
|
return rels
|
||||||
|
.filter(getRelationFilterFunction())
|
||||||
|
.groupByKey((MapFunction<Relation, String>) r -> String.join(r.getSource(), r.getTarget(), r.getRelType(), r.getSubRelType(), r.getRelClass()), Encoders.STRING())
|
||||||
|
.agg(new RelationAggregator().toColumn())
|
||||||
|
.map((MapFunction<Tuple2<String, Relation>, Relation>) t -> t._2(), Encoders.bean(Relation.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Dataset<Relation> processDataset(
|
private static Dataset<Relation> processDataset(
|
||||||
|
@ -112,6 +128,14 @@ public class SparkPropagateRelation extends AbstractSparkAction {
|
||||||
.map(mapFn, Encoders.bean(Relation.class));
|
.map(mapFn, Encoders.bean(Relation.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private FilterFunction<Relation> getRelationFilterFunction() {
|
||||||
|
return (FilterFunction<Relation>) r -> StringUtils.isNotBlank(r.getSource()) ||
|
||||||
|
StringUtils.isNotBlank(r.getTarget()) ||
|
||||||
|
StringUtils.isNotBlank(r.getRelClass()) ||
|
||||||
|
StringUtils.isNotBlank(r.getSubRelType()) ||
|
||||||
|
StringUtils.isNotBlank(r.getRelClass());
|
||||||
|
}
|
||||||
|
|
||||||
private static MapFunction<String, Relation> patchRelFn() {
|
private static MapFunction<String, Relation> patchRelFn() {
|
||||||
return value -> {
|
return value -> {
|
||||||
final Relation rel = OBJECT_MAPPER.readValue(value, Relation.class);
|
final Relation rel = OBJECT_MAPPER.readValue(value, Relation.class);
|
||||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.dedup.graph;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||||
|
@ -18,12 +19,17 @@ public class ConnectedComponent implements Serializable {
|
||||||
private Set<String> docIds;
|
private Set<String> docIds;
|
||||||
private String ccId;
|
private String ccId;
|
||||||
|
|
||||||
public ConnectedComponent() {
|
public ConnectedComponent(Set<String> docIds, final int cut) {
|
||||||
}
|
|
||||||
|
|
||||||
public ConnectedComponent(Set<String> docIds) {
|
|
||||||
this.docIds = docIds;
|
this.docIds = docIds;
|
||||||
createID();
|
createID();
|
||||||
|
if (cut > 0 && docIds.size() > cut) {
|
||||||
|
this.docIds = docIds
|
||||||
|
.stream()
|
||||||
|
.filter(s -> !ccId.equalsIgnoreCase(s))
|
||||||
|
.limit(cut - 1)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
this.docIds.add(ccId);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public String createID() {
|
public String createID() {
|
||||||
|
@ -41,6 +47,7 @@ public class ConnectedComponent implements Serializable {
|
||||||
public String getMin() {
|
public String getMin() {
|
||||||
|
|
||||||
final StringBuilder min = new StringBuilder();
|
final StringBuilder min = new StringBuilder();
|
||||||
|
|
||||||
docIds
|
docIds
|
||||||
.forEach(
|
.forEach(
|
||||||
i -> {
|
i -> {
|
||||||
|
|
|
@ -7,7 +7,7 @@ import scala.collection.JavaConversions;
|
||||||
|
|
||||||
object GraphProcessor {
|
object GraphProcessor {
|
||||||
|
|
||||||
def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
|
def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int, cut:Int): RDD[ConnectedComponent] = {
|
||||||
val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
|
val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
|
||||||
val cc = graph.connectedComponents(maxIterations).vertices
|
val cc = graph.connectedComponents(maxIterations).vertices
|
||||||
|
|
||||||
|
@ -22,15 +22,15 @@ object GraphProcessor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
val connectedComponents = joinResult.groupByKey()
|
val connectedComponents = joinResult.groupByKey()
|
||||||
.map[ConnectedComponent](cc => asConnectedComponent(cc))
|
.map[ConnectedComponent](cc => asConnectedComponent(cc, cut))
|
||||||
connectedComponents
|
connectedComponents
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
|
def asConnectedComponent(group: (VertexId, Iterable[String]), cut:Int): ConnectedComponent = {
|
||||||
val docs = group._2.toSet[String]
|
val docs = group._2.toSet[String]
|
||||||
val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs));
|
val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs), cut);
|
||||||
connectedComponent
|
connectedComponent
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,12 @@
|
||||||
"paramDescription": "the url for the lookup service",
|
"paramDescription": "the url for the lookup service",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"paramName": "cc",
|
||||||
|
"paramLongName": "cutConnectedComponent",
|
||||||
|
"paramDescription": "the number of maximum elements that belongs to a connected components",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "w",
|
"paramName": "w",
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "workingPath",
|
||||||
|
|
|
@ -22,5 +22,11 @@
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "workingPath",
|
||||||
"paramDescription": "path of the working directory",
|
"paramDescription": "path of the working directory",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "np",
|
||||||
|
"paramLongName": "numPartitions",
|
||||||
|
"paramDescription": "number of partitions for the similarity relations intermediate phases",
|
||||||
|
"paramRequired": false
|
||||||
}
|
}
|
||||||
]
|
]
|
|
@ -20,6 +20,10 @@
|
||||||
<name>dedupGraphPath</name>
|
<name>dedupGraphPath</name>
|
||||||
<description>path for the output graph</description>
|
<description>path for the output graph</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>cutConnectedComponent</name>
|
||||||
|
<description>max number of elements in a connected component</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
<description>memory for driver process</description>
|
<description>memory for driver process</description>
|
||||||
|
@ -106,10 +110,11 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--i</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--la</arg><arg>${isLookUpUrl}</arg>
|
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||||
<arg>--asi</arg><arg>${actionSetId}</arg>
|
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||||
<arg>--w</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>--numPartitions</arg><arg>8000</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="CreateMergeRel"/>
|
<ok to="CreateMergeRel"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -132,10 +137,11 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--i</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--w</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--la</arg><arg>${isLookUpUrl}</arg>
|
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||||
<arg>--asi</arg><arg>${actionSetId}</arg>
|
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||||
|
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="CreateDedupRecord"/>
|
<ok to="CreateDedupRecord"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -158,10 +164,10 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--i</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--w</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--la</arg><arg>${isLookUpUrl}</arg>
|
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||||
<arg>--asi</arg><arg>${actionSetId}</arg>
|
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="UpdateEntity"/>
|
<ok to="UpdateEntity"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -184,9 +190,9 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--i</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--w</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--o</arg><arg>${dedupGraphPath}</arg>
|
<arg>--dedupGraphPath</arg><arg>${dedupGraphPath}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="copyRelations"/>
|
<ok to="copyRelations"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -1,17 +1,17 @@
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"paramName": "i",
|
"paramName": "i",
|
||||||
"paramLongName": "graphBasePath",
|
"paramLongName": "graphBasePath",
|
||||||
"paramDescription": "the base path of raw graph",
|
"paramDescription": "the base path of raw graph",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "w",
|
"paramName": "w",
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "workingPath",
|
||||||
"paramDescription": "the working directory path",
|
"paramDescription": "the working directory path",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "dedupGraphPath",
|
"paramLongName": "dedupGraphPath",
|
||||||
"paramDescription": "the path of the dedup graph",
|
"paramDescription": "the path of the dedup graph",
|
||||||
|
|
|
@ -3,6 +3,8 @@ package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import static java.nio.file.Files.createTempDirectory;
|
import static java.nio.file.Files.createTempDirectory;
|
||||||
|
|
||||||
|
import static org.apache.spark.sql.functions.col;
|
||||||
|
import static org.apache.spark.sql.functions.count;
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.mockito.Mockito.lenient;
|
import static org.mockito.Mockito.lenient;
|
||||||
|
|
||||||
|
@ -11,6 +13,9 @@ import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -18,6 +23,7 @@ import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -71,11 +77,13 @@ public class SparkDedupTest implements Serializable {
|
||||||
FileUtils.deleteDirectory(new File(testOutputBasePath));
|
FileUtils.deleteDirectory(new File(testOutputBasePath));
|
||||||
FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
|
FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
|
||||||
|
|
||||||
|
final SparkConf conf = new SparkConf();
|
||||||
|
conf.set("spark.sql.shuffle.partitions", "200");
|
||||||
spark = SparkSession
|
spark = SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.appName(SparkDedupTest.class.getSimpleName())
|
.appName(SparkDedupTest.class.getSimpleName())
|
||||||
.master("local[*]")
|
.master("local[*]")
|
||||||
.config(new SparkConf())
|
.config(conf)
|
||||||
.getOrCreate();
|
.getOrCreate();
|
||||||
|
|
||||||
jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
@ -152,37 +160,42 @@ public class SparkDedupTest implements Serializable {
|
||||||
parser
|
parser
|
||||||
.parseArgument(
|
.parseArgument(
|
||||||
new String[] {
|
new String[] {
|
||||||
"-i",
|
"-i", testGraphBasePath,
|
||||||
testGraphBasePath,
|
"-asi", testActionSetId,
|
||||||
"-asi",
|
"-la", "lookupurl",
|
||||||
testActionSetId,
|
"-w", testOutputBasePath,
|
||||||
"-la",
|
"-np", "50"
|
||||||
"lookupurl",
|
|
||||||
"-w",
|
|
||||||
testOutputBasePath
|
|
||||||
});
|
});
|
||||||
|
|
||||||
new SparkCreateSimRels(parser, spark).run(isLookUpService);
|
new SparkCreateSimRels(parser, spark).run(isLookUpService);
|
||||||
|
|
||||||
long orgs_simrel = spark
|
long orgs_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
|
.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
long pubs_simrel = spark
|
long pubs_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
|
.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
|
||||||
.count();
|
.count();
|
||||||
long sw_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_simrel").count();
|
|
||||||
|
|
||||||
long ds_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel").count();
|
long sw_simrel = spark
|
||||||
|
.read()
|
||||||
|
.textFile(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
|
||||||
|
.count();
|
||||||
|
|
||||||
|
long ds_simrel = spark
|
||||||
|
.read()
|
||||||
|
.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
|
||||||
|
.count();
|
||||||
|
|
||||||
long orp_simrel = spark
|
long orp_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(3432, orgs_simrel);
|
assertEquals(3432, orgs_simrel);
|
||||||
assertEquals(7054, pubs_simrel);
|
assertEquals(7152, pubs_simrel);
|
||||||
assertEquals(344, sw_simrel);
|
assertEquals(344, sw_simrel);
|
||||||
assertEquals(458, ds_simrel);
|
assertEquals(458, ds_simrel);
|
||||||
assertEquals(6750, orp_simrel);
|
assertEquals(6750, orp_simrel);
|
||||||
|
@ -190,6 +203,100 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(2)
|
@Order(2)
|
||||||
|
public void cutMergeRelsTest() throws Exception {
|
||||||
|
|
||||||
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkCreateMergeRels.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
|
||||||
|
parser
|
||||||
|
.parseArgument(
|
||||||
|
new String[] {
|
||||||
|
"-i",
|
||||||
|
testGraphBasePath,
|
||||||
|
"-asi",
|
||||||
|
testActionSetId,
|
||||||
|
"-la",
|
||||||
|
"lookupurl",
|
||||||
|
"-w",
|
||||||
|
testOutputBasePath,
|
||||||
|
"-cc",
|
||||||
|
"3"
|
||||||
|
});
|
||||||
|
|
||||||
|
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
||||||
|
|
||||||
|
long orgs_mergerel = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
||||||
|
.as(Encoders.bean(Relation.class))
|
||||||
|
.filter((FilterFunction<Relation>) r -> r.getRelClass().equalsIgnoreCase("merges"))
|
||||||
|
.groupBy("source")
|
||||||
|
.agg(count("target").alias("cnt"))
|
||||||
|
.select("source", "cnt")
|
||||||
|
.where("cnt > 3")
|
||||||
|
.count();
|
||||||
|
|
||||||
|
long pubs_mergerel = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")
|
||||||
|
.as(Encoders.bean(Relation.class))
|
||||||
|
.filter((FilterFunction<Relation>) r -> r.getRelClass().equalsIgnoreCase("merges"))
|
||||||
|
.groupBy("source")
|
||||||
|
.agg(count("target").alias("cnt"))
|
||||||
|
.select("source", "cnt")
|
||||||
|
.where("cnt > 3")
|
||||||
|
.count();
|
||||||
|
long sw_mergerel = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
|
||||||
|
.as(Encoders.bean(Relation.class))
|
||||||
|
.filter((FilterFunction<Relation>) r -> r.getRelClass().equalsIgnoreCase("merges"))
|
||||||
|
.groupBy("source")
|
||||||
|
.agg(count("target").alias("cnt"))
|
||||||
|
.select("source", "cnt")
|
||||||
|
.where("cnt > 3")
|
||||||
|
.count();
|
||||||
|
|
||||||
|
long ds_mergerel = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
|
||||||
|
.as(Encoders.bean(Relation.class))
|
||||||
|
.filter((FilterFunction<Relation>) r -> r.getRelClass().equalsIgnoreCase("merges"))
|
||||||
|
.groupBy("source")
|
||||||
|
.agg(count("target").alias("cnt"))
|
||||||
|
.select("source", "cnt")
|
||||||
|
.where("cnt > 3")
|
||||||
|
.count();
|
||||||
|
|
||||||
|
long orp_mergerel = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
||||||
|
.as(Encoders.bean(Relation.class))
|
||||||
|
.filter((FilterFunction<Relation>) r -> r.getRelClass().equalsIgnoreCase("merges"))
|
||||||
|
.groupBy("source")
|
||||||
|
.agg(count("target").alias("cnt"))
|
||||||
|
.select("source", "cnt")
|
||||||
|
.where("cnt > 3")
|
||||||
|
.count();
|
||||||
|
|
||||||
|
assertEquals(0, orgs_mergerel);
|
||||||
|
assertEquals(0, pubs_mergerel);
|
||||||
|
assertEquals(0, sw_mergerel);
|
||||||
|
assertEquals(0, ds_mergerel);
|
||||||
|
assertEquals(0, orp_mergerel);
|
||||||
|
|
||||||
|
FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel"));
|
||||||
|
FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel"));
|
||||||
|
FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/software_mergerel"));
|
||||||
|
FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel"));
|
||||||
|
FileUtils.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Order(3)
|
||||||
public void createMergeRelsTest() throws Exception {
|
public void createMergeRelsTest() throws Exception {
|
||||||
|
|
||||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
@ -225,8 +332,10 @@ public class SparkDedupTest implements Serializable {
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
|
||||||
.count();
|
.count();
|
||||||
|
long ds_mergerel = spark
|
||||||
long ds_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel").count();
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
|
||||||
|
.count();
|
||||||
|
|
||||||
long orp_mergerel = spark
|
long orp_mergerel = spark
|
||||||
.read()
|
.read()
|
||||||
|
@ -234,14 +343,14 @@ public class SparkDedupTest implements Serializable {
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(1276, orgs_mergerel);
|
assertEquals(1276, orgs_mergerel);
|
||||||
assertEquals(1440, pubs_mergerel);
|
assertEquals(1442, pubs_mergerel);
|
||||||
assertEquals(288, sw_mergerel);
|
assertEquals(288, sw_mergerel);
|
||||||
assertEquals(472, ds_mergerel);
|
assertEquals(472, ds_mergerel);
|
||||||
assertEquals(718, orp_mergerel);
|
assertEquals(718, orp_mergerel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(3)
|
@Order(4)
|
||||||
public void createDedupRecordTest() throws Exception {
|
public void createDedupRecordTest() throws Exception {
|
||||||
|
|
||||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
@ -288,7 +397,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(4)
|
@Order(5)
|
||||||
public void updateEntityTest() throws Exception {
|
public void updateEntityTest() throws Exception {
|
||||||
|
|
||||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
@ -404,7 +513,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(5)
|
@Order(6)
|
||||||
public void propagateRelationTest() throws Exception {
|
public void propagateRelationTest() throws Exception {
|
||||||
|
|
||||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
@ -423,7 +532,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
|
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
|
||||||
|
|
||||||
assertEquals(4971, relations);
|
assertEquals(4866, relations);
|
||||||
|
|
||||||
// check deletedbyinference
|
// check deletedbyinference
|
||||||
final Dataset<Relation> mergeRels = spark
|
final Dataset<Relation> mergeRels = spark
|
||||||
|
@ -454,7 +563,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(6)
|
@Order(7)
|
||||||
public void testRelations() throws Exception {
|
public void testRelations() throws Exception {
|
||||||
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_1.json", 12, 10);
|
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_1.json", 12, 10);
|
||||||
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_2.json", 10, 2);
|
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_2.json", 10, 2);
|
||||||
|
|
|
@ -6,10 +6,10 @@
|
||||||
"subEntityType" : "resulttype",
|
"subEntityType" : "resulttype",
|
||||||
"subEntityValue" : "dataset",
|
"subEntityValue" : "dataset",
|
||||||
"orderField" : "title",
|
"orderField" : "title",
|
||||||
"queueMaxSize" : "800",
|
"queueMaxSize" : "100",
|
||||||
"groupMaxSize" : "100",
|
"groupMaxSize" : "100",
|
||||||
"maxChildren" : "100",
|
"maxChildren" : "100",
|
||||||
"slidingWindowSize" : "80",
|
"slidingWindowSize" : "100",
|
||||||
"rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
"rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||||
"includeChildren" : "true",
|
"includeChildren" : "true",
|
||||||
"idPath" : "$.id",
|
"idPath" : "$.id",
|
||||||
|
@ -17,7 +17,8 @@
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } },
|
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||||
|
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||||
],
|
],
|
||||||
"decisionTree" : {
|
"decisionTree" : {
|
||||||
|
|
|
@ -6,10 +6,10 @@
|
||||||
"subEntityType" : "resulttype",
|
"subEntityType" : "resulttype",
|
||||||
"subEntityValue" : "otherresearchproduct",
|
"subEntityValue" : "otherresearchproduct",
|
||||||
"orderField" : "title",
|
"orderField" : "title",
|
||||||
"queueMaxSize" : "800",
|
"queueMaxSize" : "100",
|
||||||
"groupMaxSize" : "100",
|
"groupMaxSize" : "100",
|
||||||
"maxChildren" : "100",
|
"maxChildren" : "100",
|
||||||
"slidingWindowSize" : "80",
|
"slidingWindowSize" : "100",
|
||||||
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||||
"includeChildren" : "true",
|
"includeChildren" : "true",
|
||||||
"idPath" : "$.id",
|
"idPath" : "$.id",
|
||||||
|
@ -17,7 +17,8 @@
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } },
|
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||||
|
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||||
],
|
],
|
||||||
"decisionTree" : {
|
"decisionTree" : {
|
||||||
|
|
|
@ -6,10 +6,10 @@
|
||||||
"subEntityType": "resulttype",
|
"subEntityType": "resulttype",
|
||||||
"subEntityValue": "publication",
|
"subEntityValue": "publication",
|
||||||
"orderField": "title",
|
"orderField": "title",
|
||||||
"queueMaxSize": "800",
|
"queueMaxSize": "100",
|
||||||
"groupMaxSize": "100",
|
"groupMaxSize": "100",
|
||||||
"maxChildren": "100",
|
"maxChildren": "100",
|
||||||
"slidingWindowSize": "80",
|
"slidingWindowSize": "100",
|
||||||
"rootBuilder": [
|
"rootBuilder": [
|
||||||
"result",
|
"result",
|
||||||
"resultProject_outcome_isProducedBy",
|
"resultProject_outcome_isProducedBy",
|
||||||
|
@ -29,7 +29,8 @@
|
||||||
},
|
},
|
||||||
"pace": {
|
"pace": {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } },
|
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||||
|
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||||
],
|
],
|
||||||
"decisionTree": {
|
"decisionTree": {
|
||||||
|
|
|
@ -6,10 +6,10 @@
|
||||||
"subEntityType" : "resulttype",
|
"subEntityType" : "resulttype",
|
||||||
"subEntityValue" : "software",
|
"subEntityValue" : "software",
|
||||||
"orderField" : "title",
|
"orderField" : "title",
|
||||||
"queueMaxSize" : "800",
|
"queueMaxSize" : "100",
|
||||||
"groupMaxSize" : "100",
|
"groupMaxSize" : "100",
|
||||||
"maxChildren" : "100",
|
"maxChildren" : "100",
|
||||||
"slidingWindowSize" : "80",
|
"slidingWindowSize" : "100",
|
||||||
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||||
"includeChildren" : "true",
|
"includeChildren" : "true",
|
||||||
"idPath" : "$.id",
|
"idPath" : "$.id",
|
||||||
|
@ -17,8 +17,9 @@
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "wordssuffixprefix", "fields" : [ "title" ], "params" : { "max" : "2", "len" : "3" } },
|
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||||
{ "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } }
|
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||||
|
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||||
],
|
],
|
||||||
"decisionTree": {
|
"decisionTree": {
|
||||||
"start": {
|
"start": {
|
||||||
|
|
Loading…
Reference in New Issue