forked from antonis.lempesis/dnet-hadoop
implemented relation with dataset
This commit is contained in:
parent
addaaa091f
commit
0594b92a6d
|
@ -25,8 +25,6 @@ import java.io.IOException;
|
||||||
public class SparkUpdateEntityJob {
|
public class SparkUpdateEntityJob {
|
||||||
|
|
||||||
final static String IDJSONPATH = "$.id";
|
final static String IDJSONPATH = "$.id";
|
||||||
final static String SOURCEJSONPATH = "$.source";
|
|
||||||
final static String TARGETJSONPATH = "$.target";
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_delete_by_inference_parameters.json")));
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_delete_by_inference_parameters.json")));
|
||||||
|
|
|
@ -159,6 +159,14 @@ public class SparkScholexplorerMergeEntitiesJob {
|
||||||
}
|
}
|
||||||
, Encoders.bean(Relation.class));
|
, Encoders.bean(Relation.class));
|
||||||
secondJoin.write().mode(SaveMode.Overwrite).save(targetPath+"_fixed");
|
secondJoin.write().mode(SaveMode.Overwrite).save(targetPath+"_fixed");
|
||||||
|
|
||||||
|
|
||||||
|
FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration());
|
||||||
|
|
||||||
|
|
||||||
|
fileSystem.delete(new Path(targetPath), true);
|
||||||
|
fileSystem.rename(new Path(targetPath+"_fixed"),new Path(targetPath));
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
package eu.dnetlib.dhp.provision
|
||||||
|
|
||||||
|
import org.apache.spark.sql.SparkSession
|
||||||
|
import org.apache.spark.sql.functions.{coalesce, col, count, lit}
|
||||||
|
|
||||||
|
object DatasetJoiner {
|
||||||
|
|
||||||
|
def startJoin(spark: SparkSession, relPath:String, targetPath:String) {
|
||||||
|
val relation = spark.read.load(relPath)
|
||||||
|
|
||||||
|
val relatedPublication = relation.where("target like '50%'").groupBy("source").agg(count("target").as("publication")).select(col("source"). alias("p_source"), col("publication"))
|
||||||
|
val relatedDataset = relation.where("target like '60%'").groupBy("source").agg(count("target").as("dataset")).select(col("source"). alias("d_source"), col("dataset"))
|
||||||
|
val relatedUnknown = relation.where("target like '70%'").groupBy("source").agg(count("target").as("unknown")).select(col("source"). alias("u_source"), col("unknown"))
|
||||||
|
val firstJoin = relatedPublication
|
||||||
|
.join(relatedDataset,col("p_source").equalTo(col("d_source")),"full")
|
||||||
|
.select(coalesce(col("p_source"), col("d_source")).alias("id"),
|
||||||
|
col("publication"),
|
||||||
|
col("dataset"))
|
||||||
|
.join(relatedUnknown, col("u_source").equalTo(col("id")),"full")
|
||||||
|
.select(coalesce(col("u_source"), col("id")).alias("source"),
|
||||||
|
coalesce(col("publication"),lit(0)).alias("relatedPublication"),
|
||||||
|
coalesce(col("dataset"),lit(0)).alias("relatedDataset"),
|
||||||
|
coalesce(col("unknown"),lit(0)).alias("relatedUnknown")
|
||||||
|
)
|
||||||
|
firstJoin.write.mode("overwrite").save(targetPath)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -10,21 +10,21 @@ public class ProvisionUtil {
|
||||||
public final static String TARGETJSONPATH = "$.target";
|
public final static String TARGETJSONPATH = "$.target";
|
||||||
public final static String SOURCEJSONPATH = "$.source";
|
public final static String SOURCEJSONPATH = "$.source";
|
||||||
|
|
||||||
public static RelatedItemInfo getItemType(final String item, final String idPath) {
|
// public static RelatedItemInfo getItemType(final String item, final String idPath) {
|
||||||
String targetId = DHPUtils.getJPathString(idPath, item);
|
// String targetId = DHPUtils.getJPathString(idPath, item);
|
||||||
switch (StringUtils.substringBefore(targetId, "|")) {
|
// switch (StringUtils.substringBefore(targetId, "|")) {
|
||||||
case "50":
|
// case "50":
|
||||||
return new RelatedItemInfo().setRelatedPublication(1);
|
// return new RelatedItemInfo(null,0,1,0);
|
||||||
case "60":
|
// case "60":
|
||||||
return new RelatedItemInfo().setRelatedDataset(1);
|
// return new RelatedItemInfo(null,1,0,0);
|
||||||
case "70":
|
// case "70":
|
||||||
return new RelatedItemInfo().setRelatedUnknown(1);
|
// return new RelatedItemInfo(null,0,0,1);
|
||||||
default:
|
// default:
|
||||||
throw new RuntimeException("Unknonw target ID");
|
// throw new RuntimeException("Unknonw target ID");
|
||||||
|
//
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
}
|
// }
|
||||||
|
|
||||||
public static Boolean isNotDeleted(final String item) {
|
public static Boolean isNotDeleted(final String item) {
|
||||||
return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item));
|
return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item));
|
||||||
|
|
|
@ -8,57 +8,53 @@ import java.io.Serializable;
|
||||||
|
|
||||||
public class RelatedItemInfo implements Serializable {
|
public class RelatedItemInfo implements Serializable {
|
||||||
|
|
||||||
private String id;
|
private String source;
|
||||||
|
|
||||||
private int relatedDataset = 0;
|
private long relatedDataset = 0;
|
||||||
|
|
||||||
private int relatedPublication = 0;
|
private long relatedPublication = 0;
|
||||||
|
|
||||||
private int relatedUnknown = 0;
|
private long relatedUnknown = 0;
|
||||||
|
|
||||||
|
public RelatedItemInfo() {
|
||||||
public String getId() {
|
|
||||||
return id;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public RelatedItemInfo setId(String id) {
|
public RelatedItemInfo(String source, long relatedDataset, long relatedPublication, long relatedUnknown) {
|
||||||
this.id = id;
|
this.source = source;
|
||||||
return this;
|
this.relatedDataset = relatedDataset;
|
||||||
|
this.relatedPublication = relatedPublication;
|
||||||
|
this.relatedUnknown = relatedUnknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
public RelatedItemInfo add(RelatedItemInfo other) {
|
public String getSource() {
|
||||||
if (other != null) {
|
return source;
|
||||||
relatedDataset += other.getRelatedDataset();
|
|
||||||
relatedPublication += other.getRelatedPublication();
|
|
||||||
relatedUnknown += other.getRelatedUnknown();
|
|
||||||
}
|
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getRelatedDataset() {
|
public void setSource(String source) {
|
||||||
|
this.source = source;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getRelatedDataset() {
|
||||||
return relatedDataset;
|
return relatedDataset;
|
||||||
}
|
}
|
||||||
|
|
||||||
public RelatedItemInfo setRelatedDataset(int relatedDataset) {
|
public void setRelatedDataset(long relatedDataset) {
|
||||||
this.relatedDataset = relatedDataset;
|
this.relatedDataset = relatedDataset;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getRelatedPublication() {
|
public long getRelatedPublication() {
|
||||||
return relatedPublication;
|
return relatedPublication;
|
||||||
}
|
}
|
||||||
|
|
||||||
public RelatedItemInfo setRelatedPublication(int relatedPublication) {
|
public void setRelatedPublication(long relatedPublication) {
|
||||||
this.relatedPublication = relatedPublication;
|
this.relatedPublication = relatedPublication;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getRelatedUnknown() {
|
public long getRelatedUnknown() {
|
||||||
return relatedUnknown;
|
return relatedUnknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
public RelatedItemInfo setRelatedUnknown(int relatedUnknown) {
|
public void setRelatedUnknown(int relatedUnknown) {
|
||||||
this.relatedUnknown = relatedUnknown;
|
this.relatedUnknown = relatedUnknown;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,19 +1,22 @@
|
||||||
package eu.dnetlib.dhp.provision;
|
package eu.dnetlib.dhp.provision;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.jayway.jsonpath.JsonPath;
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import net.minidev.json.JSONArray;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.Function2;
|
import org.apache.spark.api.java.function.Function2;
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.*;
|
||||||
|
import org.apache.spark.sql.catalyst.expressions.Expression;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* SparkExtractRelationCount is a spark job that takes in input relation RDD
|
* SparkExtractRelationCount is a spark job that takes in input relation RDD
|
||||||
|
@ -42,27 +45,34 @@ public class SparkExtractRelationCount {
|
||||||
|
|
||||||
final String relationPath = parser.get("relationPath");
|
final String relationPath = parser.get("relationPath");
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
|
||||||
|
|
||||||
sc.textFile(relationPath)
|
|
||||||
// We start to Filter the relation not deleted by Inference
|
|
||||||
.filter(ProvisionUtil::isNotDeleted)
|
|
||||||
// Then we create a PairRDD<String, RelatedItem>
|
DatasetJoiner.startJoin(spark, relationPath,workingDirPath + "/relatedItemCount");
|
||||||
.mapToPair((PairFunction<String, String, RelatedItemInfo>) f
|
|
||||||
-> new Tuple2<>(DHPUtils.getJPathString(ProvisionUtil.SOURCEJSONPATH, f), ProvisionUtil.getItemType(f, ProvisionUtil.TARGETJSONPATH)))
|
|
||||||
//We reduce and sum the number of Relations
|
|
||||||
.reduceByKey((Function2<RelatedItemInfo, RelatedItemInfo, RelatedItemInfo>) (v1, v2) -> {
|
|
||||||
if (v1 == null && v2 == null)
|
// sc.textFile(relationPath)
|
||||||
return new RelatedItemInfo();
|
// // We start to Filter the relation not deleted by Inference
|
||||||
return v1 != null ? v1.add(v2) : v2;
|
// .filter(ProvisionUtil::isNotDeleted)
|
||||||
})
|
// // Then we create a PairRDD<String, RelatedItem>
|
||||||
//Set the source Id in RelatedItem object
|
// .mapToPair((PairFunction<String, String, RelatedItemInfo>) f
|
||||||
.map(k -> k._2().setId(k._1()))
|
// -> new Tuple2<>(DHPUtils.getJPathString(ProvisionUtil.SOURCEJSONPATH, f), ProvisionUtil.getItemType(f, ProvisionUtil.TARGETJSONPATH)))
|
||||||
// Convert to JSON and save as TextFile
|
// //We reduce and sum the number of Relations
|
||||||
.map(k -> {
|
// .reduceByKey((Function2<RelatedItemInfo, RelatedItemInfo, RelatedItemInfo>) (v1, v2) -> {
|
||||||
ObjectMapper mapper = new ObjectMapper();
|
// if (v1 == null && v2 == null)
|
||||||
return mapper.writeValueAsString(k);
|
// return new RelatedItemInfo();
|
||||||
}).saveAsTextFile(workingDirPath + "/relatedItemCount", GzipCodec.class);
|
// return v1 != null ? v1.add(v2) : v2;
|
||||||
|
// })
|
||||||
|
// //Set the source Id in RelatedItem object
|
||||||
|
// .map(k -> k._2().setId(k._1()))
|
||||||
|
// // Convert to JSON and save as TextFile
|
||||||
|
// .map(k -> {
|
||||||
|
// ObjectMapper mapper = new ObjectMapper();
|
||||||
|
// return mapper.writeValueAsString(k);
|
||||||
|
// }).saveAsTextFile(workingDirPath + "/relatedItemCount", GzipCodec.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,22 @@
|
||||||
package eu.dnetlib.dhp.provision;
|
package eu.dnetlib.dhp.provision;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.provision.scholix.Scholix;
|
import eu.dnetlib.dhp.provision.scholix.*;
|
||||||
import eu.dnetlib.dhp.provision.scholix.ScholixResource;
|
import eu.dnetlib.dhp.provision.scholix.summary.*;
|
||||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.PairFlatMapFunction;
|
import org.apache.spark.api.java.function.PairFlatMapFunction;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.*;
|
||||||
|
|
||||||
|
import static org.apache.spark.sql.functions.col;
|
||||||
|
|
||||||
|
import scala.Int;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -19,19 +25,34 @@ import java.util.Random;
|
||||||
|
|
||||||
public class SparkGenerateScholix {
|
public class SparkGenerateScholix {
|
||||||
|
|
||||||
private static final String jsonIDPath = "$.id";
|
|
||||||
private static final String sourceIDPath = "$.source";
|
|
||||||
private static final String targetIDPath = "$.target";
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholix.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")));
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholix.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.set("spark.sql.shuffle.partitions","4000");
|
||||||
|
// conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||||
|
// conf.registerKryoClasses(new Class[]{
|
||||||
|
// ScholixSummary.class,
|
||||||
|
// CollectedFromType.class,
|
||||||
|
// SchemeValue.class,
|
||||||
|
// TypedIdentifier.class,
|
||||||
|
// Typology.class,
|
||||||
|
// Relation.class,
|
||||||
|
// Scholix.class,
|
||||||
|
// ScholixCollectedFrom.class,
|
||||||
|
// ScholixEntityId.class,
|
||||||
|
// ScholixIdentifier.class,
|
||||||
|
// ScholixRelationship.class,
|
||||||
|
// ScholixResource.class
|
||||||
|
// });
|
||||||
|
|
||||||
|
|
||||||
final SparkSession spark = SparkSession
|
final SparkSession spark = SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
|
.config(conf)
|
||||||
.appName(SparkExtractRelationCount.class.getSimpleName())
|
.appName(SparkExtractRelationCount.class.getSimpleName())
|
||||||
.master(parser.get("master"))
|
.master(parser.get("master"))
|
||||||
.getOrCreate();
|
.getOrCreate();
|
||||||
|
@ -42,51 +63,30 @@ public class SparkGenerateScholix {
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
final Dataset<ScholixSummary> scholixSummary = spark.read().load(workingDirPath + "/summary").as(Encoders.bean(ScholixSummary.class));
|
||||||
|
final Dataset<Relation> rels = spark.read().load(graphPath + "/relation").as(Encoders.bean(Relation.class));
|
||||||
|
|
||||||
// final JavaRDD<String> relationToExport = sc.textFile(graphPath + "/relation").filter(ProvisionUtil::isNotDeleted).repartition(4000);
|
|
||||||
final JavaPairRDD<String,ScholixResource> scholixSummary =
|
|
||||||
sc.textFile(workingDirPath + "/summary")
|
|
||||||
.flatMapToPair((PairFlatMapFunction<String, String, ScholixResource>) i -> {
|
|
||||||
final ObjectMapper mapper = new ObjectMapper();
|
|
||||||
final ScholixSummary summary = mapper.readValue(i, ScholixSummary.class);
|
|
||||||
ScholixResource tmp = ScholixResource.fromSummary(summary);
|
|
||||||
final List<Tuple2<String, ScholixResource>> result = new ArrayList<>();
|
|
||||||
for (int k = 0; k<10; k++)
|
|
||||||
result.add(new Tuple2<>(String.format("%s::%d", tmp.getDnetIdentifier(), k), tmp));
|
|
||||||
return result.iterator();
|
|
||||||
});
|
|
||||||
// scholixSummary.join(
|
|
||||||
// relationToExport
|
|
||||||
// .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(sourceIDPath, i), i)))
|
|
||||||
// .map(Tuple2::_2)
|
|
||||||
// .mapToPair(summaryRelation ->
|
|
||||||
// new Tuple2<>(
|
|
||||||
// DHPUtils.getJPathString(targetIDPath, summaryRelation._2()),
|
|
||||||
// Scholix.generateScholixWithSource(summaryRelation._1(), summaryRelation._2())))
|
|
||||||
//
|
|
||||||
// .map(t-> t._2().setTarget(new ScholixResource().setDnetIdentifier(t._1())))
|
|
||||||
// .map(s-> {
|
|
||||||
// ObjectMapper mapper = new ObjectMapper();
|
|
||||||
// return mapper.writeValueAsString(s);
|
|
||||||
// })
|
|
||||||
// .saveAsTextFile(workingDirPath + "/scholix", GzipCodec.class);
|
|
||||||
|
|
||||||
sc.textFile(workingDirPath + "/scholix")
|
Dataset<Scholix> firstJoin = scholixSummary.joinWith(rels, scholixSummary.col("id").equalTo(rels.col("source")))
|
||||||
.mapToPair(t -> {
|
.map((MapFunction<Tuple2<ScholixSummary, Relation>, Scholix>) f -> Scholix.generateScholixWithSource(f._1(), f._2()), Encoders.bean(Scholix.class));
|
||||||
ObjectMapper mapper = new ObjectMapper();
|
|
||||||
Scholix scholix = mapper.readValue(t, Scholix.class);
|
firstJoin.write().mode(SaveMode.Overwrite).save(workingDirPath+"/scholix_1");
|
||||||
Random rand = new Random();
|
firstJoin = spark.read().load(workingDirPath+"/scholix_1").as(Encoders.bean(Scholix.class));
|
||||||
return new Tuple2<>(String.format("%s::%d",scholix.getTarget().getDnetIdentifier(), rand.nextInt(10)), scholix);
|
|
||||||
})
|
|
||||||
.join(scholixSummary)
|
|
||||||
.map(t-> {
|
Dataset<Scholix> scholix_final = spark.read().load(workingDirPath+"/scholix_1").as(Encoders.bean(Scholix.class));
|
||||||
Scholix item = t._2()._1().setTarget(t._2()._2());
|
|
||||||
item.generateIdentifier();
|
Dataset<ScholixResource> target = spark.read().load(workingDirPath+"/scholix_target").as(Encoders.bean(ScholixResource.class));
|
||||||
return item;
|
|
||||||
})
|
scholix_final.joinWith(target, scholix_final.col("identifier").equalTo(target.col("dnetIdentifier")), "inner")
|
||||||
.map(s-> new ObjectMapper().writeValueAsString(s)).saveAsTextFile(workingDirPath + "/scholix_index", GzipCodec.class);
|
.map((MapFunction<Tuple2<Scholix, ScholixResource>, Scholix>) f -> {
|
||||||
|
final Scholix scholix = f._1();
|
||||||
|
final ScholixResource scholixTarget = f._2();
|
||||||
|
scholix.setTarget(scholixTarget);
|
||||||
|
scholix.generateIdentifier();
|
||||||
|
scholix.generatelinkPublisher();
|
||||||
|
return scholix;
|
||||||
|
}, Encoders.bean(Scholix.class)).repartition(5000).write().mode(SaveMode.Overwrite).save(workingDirPath+"/scholix_index");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,14 +1,19 @@
|
||||||
package eu.dnetlib.dhp.provision;
|
package eu.dnetlib.dhp.provision;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.Function;
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
@ -31,27 +36,53 @@ public class SparkGenerateSummary {
|
||||||
final String workingDirPath = parser.get("workingDirPath");
|
final String workingDirPath = parser.get("workingDirPath");
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
JavaPairRDD<String, String> relationCount = sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i));
|
|
||||||
|
|
||||||
JavaPairRDD<String, String> entities =
|
Dataset<RelatedItemInfo> rInfo = spark.read().load(workingDirPath + "/relatedItemCount").as(Encoders.bean(RelatedItemInfo.class));
|
||||||
sc.textFile(graphPath + "/publication")
|
|
||||||
.filter(ProvisionUtil::isNotDeleted)
|
|
||||||
.mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
|
|
||||||
.union(
|
|
||||||
sc.textFile(graphPath + "/dataset")
|
|
||||||
.filter(ProvisionUtil::isNotDeleted)
|
|
||||||
.mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
|
|
||||||
)
|
|
||||||
.union(
|
|
||||||
sc.textFile(graphPath + "/unknown")
|
|
||||||
.filter(ProvisionUtil::isNotDeleted)
|
|
||||||
.mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
|
|
||||||
);
|
|
||||||
entities.join(relationCount).map((Function<Tuple2<String, Tuple2<String, String>>, String>) k ->
|
|
||||||
ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class);
|
|
||||||
|
|
||||||
|
|
||||||
;
|
Dataset<ScholixSummary> entity = spark.createDataset(sc.textFile(graphPath + "/publication," + graphPath + "/dataset," + graphPath + "/unknown")
|
||||||
|
.map(s ->
|
||||||
|
ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(DHPUtils.getJPathString(jsonIDPath, s)), s)
|
||||||
|
|
||||||
|
|
||||||
|
).rdd(), Encoders.bean(ScholixSummary.class));
|
||||||
|
|
||||||
|
|
||||||
|
Dataset<ScholixSummary> summaryComplete = rInfo.joinWith(entity, rInfo.col("source").equalTo(entity.col("id"))).map((MapFunction<Tuple2<RelatedItemInfo, ScholixSummary>, ScholixSummary>) t ->
|
||||||
|
{
|
||||||
|
ScholixSummary scholixSummary = t._2();
|
||||||
|
RelatedItemInfo relatedItemInfo = t._1();
|
||||||
|
scholixSummary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
|
||||||
|
scholixSummary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
|
||||||
|
scholixSummary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
|
||||||
|
return scholixSummary;
|
||||||
|
}, Encoders.bean(ScholixSummary.class)
|
||||||
|
);
|
||||||
|
|
||||||
|
summaryComplete.write().save(workingDirPath+"/summary");
|
||||||
|
|
||||||
|
|
||||||
|
// JavaPairRDD<String, String> relationCount = sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i));
|
||||||
|
//
|
||||||
|
// JavaPairRDD<String, String> entities =
|
||||||
|
// sc.textFile(graphPath + "/publication")
|
||||||
|
// .filter(ProvisionUtil::isNotDeleted)
|
||||||
|
// .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
|
||||||
|
// .union(
|
||||||
|
// sc.textFile(graphPath + "/dataset")
|
||||||
|
// .filter(ProvisionUtil::isNotDeleted)
|
||||||
|
// .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
|
||||||
|
// )
|
||||||
|
// .union(
|
||||||
|
// sc.textFile(graphPath + "/unknown")
|
||||||
|
// .filter(ProvisionUtil::isNotDeleted)
|
||||||
|
// .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
|
||||||
|
// );
|
||||||
|
// entities.join(relationCount).map((Function<Tuple2<String, Tuple2<String, String>>, String>) k ->
|
||||||
|
// ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class);
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// ;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,20 @@
|
||||||
package eu.dnetlib.dhp.provision;
|
package eu.dnetlib.dhp.provision;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.provision.scholix.Scholix;
|
||||||
|
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
|
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
|
||||||
|
|
||||||
|
import java.nio.file.attribute.AclFileAttributeView;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -21,17 +28,30 @@ public class SparkIndexCollectionOnES {
|
||||||
SparkConf conf = new SparkConf().setAppName(SparkIndexCollectionOnES.class.getSimpleName())
|
SparkConf conf = new SparkConf().setAppName(SparkIndexCollectionOnES.class.getSimpleName())
|
||||||
.setMaster(parser.get("master"));
|
.setMaster(parser.get("master"));
|
||||||
|
|
||||||
|
conf.set("spark.sql.shuffle.partitions","4000");
|
||||||
|
|
||||||
|
|
||||||
final String sourcePath = parser.get("sourcePath");
|
final String sourcePath = parser.get("sourcePath");
|
||||||
final String index = parser.get("index");
|
final String index = parser.get("index");
|
||||||
final String idPath = parser.get("idPath");
|
final String idPath = parser.get("idPath");
|
||||||
|
final String type = parser.get("type");
|
||||||
|
|
||||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||||
|
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
JavaRDD<String> inputRdd = sc.textFile(sourcePath);
|
JavaRDD<String> inputRdd;
|
||||||
|
|
||||||
|
|
||||||
|
if("summary".equalsIgnoreCase(type))
|
||||||
|
inputRdd = spark.read().load(sourcePath).as(Encoders.bean(ScholixSummary.class)).map((MapFunction<ScholixSummary, String>) f -> {
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
return mapper.writeValueAsString(f);
|
||||||
|
}, Encoders.STRING()).javaRDD();
|
||||||
|
|
||||||
|
else
|
||||||
|
inputRdd = sc.textFile(sourcePath);
|
||||||
|
|
||||||
Map<String, String> esCfg = new HashMap<>();
|
Map<String, String> esCfg = new HashMap<>();
|
||||||
esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
|
esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
|
||||||
|
@ -40,8 +60,6 @@ public class SparkIndexCollectionOnES {
|
||||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
esCfg.put("es.batch.write.retry.wait", "60s");
|
||||||
esCfg.put("es.batch.size.entries", "200");
|
esCfg.put("es.batch.size.entries", "200");
|
||||||
esCfg.put("es.nodes.wan.only", "true");
|
esCfg.put("es.nodes.wan.only", "true");
|
||||||
|
|
||||||
|
|
||||||
JavaEsSpark.saveJsonToEs(inputRdd,index, esCfg);
|
JavaEsSpark.saveJsonToEs(inputRdd,index, esCfg);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,8 +5,7 @@ import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Collections;
|
import java.util.*;
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
public class Scholix implements Serializable {
|
public class Scholix implements Serializable {
|
||||||
|
@ -25,6 +24,20 @@ public class Scholix implements Serializable {
|
||||||
private String identifier;
|
private String identifier;
|
||||||
|
|
||||||
|
|
||||||
|
public Scholix clone(final ScholixResource t) {
|
||||||
|
final Scholix clone = new Scholix();
|
||||||
|
clone.setPublicationDate(publicationDate);
|
||||||
|
clone.setPublisher(publisher);
|
||||||
|
clone.setLinkprovider(linkprovider);
|
||||||
|
clone.setRelationship(relationship);
|
||||||
|
clone.setSource(source);
|
||||||
|
clone.setTarget(t);
|
||||||
|
clone.generatelinkPublisher();
|
||||||
|
clone.generateIdentifier();
|
||||||
|
return clone;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Scholix generateScholixWithSource(final String sourceSummaryJson, final String relation) {
|
public static Scholix generateScholixWithSource(final String sourceSummaryJson, final String relation) {
|
||||||
final ObjectMapper mapper = new ObjectMapper();
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
@ -46,8 +59,36 @@ public class Scholix implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Scholix generateScholixWithSource(final ScholixSummary scholixSummary, final Relation rel) {
|
||||||
|
final Scholix s = new Scholix();
|
||||||
|
if (scholixSummary.getDate() != null && scholixSummary.getDate().size()>0)
|
||||||
|
s.setPublicationDate(scholixSummary.getDate().get(0));
|
||||||
|
s.setLinkprovider(rel.getCollectedFrom().stream().map(cf ->
|
||||||
|
new ScholixEntityId(cf.getValue(), Collections.singletonList(
|
||||||
|
new ScholixIdentifier(cf.getKey(), "dnet_identifier")
|
||||||
|
))).collect(Collectors.toList()));
|
||||||
|
s.setRelationship(new ScholixRelationship(rel.getRelType(),rel.getRelClass(),null ));
|
||||||
|
s.setSource(ScholixResource.fromSummary(scholixSummary));
|
||||||
|
|
||||||
public void generateIdentifier( ) {
|
s.setIdentifier(rel.getTarget());
|
||||||
|
// ScholixResource mockTarget = new ScholixResource();
|
||||||
|
// mockTarget.setDnetIdentifier(rel.getTarget());
|
||||||
|
// s.setTarget(mockTarget);
|
||||||
|
// s.generateIdentifier();
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void generatelinkPublisher() {
|
||||||
|
Set<String> publisher = new HashSet<>();
|
||||||
|
if (source.getPublisher() != null)
|
||||||
|
publisher.addAll(source.getPublisher().stream().map(ScholixEntityId::getName).collect(Collectors.toList()));
|
||||||
|
if (target.getPublisher() != null)
|
||||||
|
publisher.addAll(target.getPublisher().stream().map(ScholixEntityId::getName).collect(Collectors.toList()));
|
||||||
|
this.publisher = publisher.stream().map(k -> new ScholixEntityId(k ,null)).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void generateIdentifier( ) {
|
||||||
setIdentifier(DHPUtils.md5(String.format("%s::%s::%s",source.getDnetIdentifier(),relationship.getName(), target.getDnetIdentifier())));
|
setIdentifier(DHPUtils.md5(String.format("%s::%s::%s",source.getDnetIdentifier(),relationship.getName(), target.getDnetIdentifier())));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -65,67 +106,58 @@ public class Scholix implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public String getPublicationDate() {
|
public String getPublicationDate() {
|
||||||
return publicationDate;
|
return publicationDate;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Scholix setPublicationDate(String publicationDate) {
|
public void setPublicationDate(String publicationDate) {
|
||||||
this.publicationDate = publicationDate;
|
this.publicationDate = publicationDate;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<ScholixEntityId> getPublisher() {
|
public List<ScholixEntityId> getPublisher() {
|
||||||
return publisher;
|
return publisher;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Scholix setPublisher(List<ScholixEntityId> publisher) {
|
public void setPublisher(List<ScholixEntityId> publisher) {
|
||||||
this.publisher = publisher;
|
this.publisher = publisher;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<ScholixEntityId> getLinkprovider() {
|
public List<ScholixEntityId> getLinkprovider() {
|
||||||
return linkprovider;
|
return linkprovider;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Scholix setLinkprovider(List<ScholixEntityId> linkprovider) {
|
public void setLinkprovider(List<ScholixEntityId> linkprovider) {
|
||||||
this.linkprovider = linkprovider;
|
this.linkprovider = linkprovider;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixRelationship getRelationship() {
|
public ScholixRelationship getRelationship() {
|
||||||
return relationship;
|
return relationship;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Scholix setRelationship(ScholixRelationship relationship) {
|
public void setRelationship(ScholixRelationship relationship) {
|
||||||
this.relationship = relationship;
|
this.relationship = relationship;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource getSource() {
|
public ScholixResource getSource() {
|
||||||
return source;
|
return source;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Scholix setSource(ScholixResource source) {
|
public void setSource(ScholixResource source) {
|
||||||
this.source = source;
|
this.source = source;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource getTarget() {
|
public ScholixResource getTarget() {
|
||||||
return target;
|
return target;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Scholix setTarget(ScholixResource target) {
|
public void setTarget(ScholixResource target) {
|
||||||
this.target = target;
|
this.target = target;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getIdentifier() {
|
public String getIdentifier() {
|
||||||
return identifier;
|
return identifier;
|
||||||
}
|
}
|
||||||
|
public void setIdentifier(String identifier) {
|
||||||
public Scholix setIdentifier(String identifier) {
|
|
||||||
this.identifier = identifier;
|
this.identifier = identifier;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,26 +21,23 @@ public class ScholixCollectedFrom implements Serializable {
|
||||||
return provider;
|
return provider;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixCollectedFrom setProvider(ScholixEntityId provider) {
|
public void setProvider(ScholixEntityId provider) {
|
||||||
this.provider = provider;
|
this.provider = provider;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getProvisionMode() {
|
public String getProvisionMode() {
|
||||||
return provisionMode;
|
return provisionMode;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixCollectedFrom setProvisionMode(String provisionMode) {
|
public void setProvisionMode(String provisionMode) {
|
||||||
this.provisionMode = provisionMode;
|
this.provisionMode = provisionMode;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getCompletionStatus() {
|
public String getCompletionStatus() {
|
||||||
return completionStatus;
|
return completionStatus;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixCollectedFrom setCompletionStatus(String completionStatus) {
|
public void setCompletionStatus(String completionStatus) {
|
||||||
this.completionStatus = completionStatus;
|
this.completionStatus = completionStatus;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,17 +19,15 @@ public class ScholixEntityId implements Serializable {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixEntityId setName(String name) {
|
public void setName(String name) {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<ScholixIdentifier> getIdentifiers() {
|
public List<ScholixIdentifier> getIdentifiers() {
|
||||||
return identifiers;
|
return identifiers;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixEntityId setIdentifiers(List<ScholixIdentifier> identifiers) {
|
public void setIdentifiers(List<ScholixIdentifier> identifiers) {
|
||||||
this.identifiers = identifiers;
|
this.identifiers = identifiers;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,17 +18,15 @@ public class ScholixIdentifier implements Serializable {
|
||||||
return identifier;
|
return identifier;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixIdentifier setIdentifier(String identifier) {
|
public void setIdentifier(String identifier) {
|
||||||
this.identifier = identifier;
|
this.identifier = identifier;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getSchema() {
|
public String getSchema() {
|
||||||
return schema;
|
return schema;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixIdentifier setSchema(String schema) {
|
public void setSchema(String schema) {
|
||||||
this.schema = schema;
|
this.schema = schema;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,26 +20,23 @@ public class ScholixRelationship implements Serializable {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixRelationship setName(String name) {
|
public void setName(String name) {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getSchema() {
|
public String getSchema() {
|
||||||
return schema;
|
return schema;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixRelationship setSchema(String schema) {
|
public void setSchema(String schema) {
|
||||||
this.schema = schema;
|
this.schema = schema;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getInverse() {
|
public String getInverse() {
|
||||||
return inverse;
|
return inverse;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixRelationship setInverse(String inverse) {
|
public void setInverse(String inverse) {
|
||||||
this.inverse = inverse;
|
this.inverse = inverse;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,9 @@ public class ScholixResource implements Serializable {
|
||||||
private List<ScholixCollectedFrom> collectedFrom;
|
private List<ScholixCollectedFrom> collectedFrom;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public static ScholixResource fromSummary(ScholixSummary summary) {
|
public static ScholixResource fromSummary(ScholixSummary summary) {
|
||||||
|
|
||||||
final ScholixResource resource = new ScholixResource();
|
final ScholixResource resource = new ScholixResource();
|
||||||
|
@ -66,80 +69,71 @@ public class ScholixResource implements Serializable {
|
||||||
return identifier;
|
return identifier;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setIdentifier(List<ScholixIdentifier> identifier) {
|
public void setIdentifier(List<ScholixIdentifier> identifier) {
|
||||||
this.identifier = identifier;
|
this.identifier = identifier;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getDnetIdentifier() {
|
public String getDnetIdentifier() {
|
||||||
return dnetIdentifier;
|
return dnetIdentifier;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setDnetIdentifier(String dnetIdentifier) {
|
public void setDnetIdentifier(String dnetIdentifier) {
|
||||||
this.dnetIdentifier = dnetIdentifier;
|
this.dnetIdentifier = dnetIdentifier;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getObjectType() {
|
public String getObjectType() {
|
||||||
return objectType;
|
return objectType;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setObjectType(String objectType) {
|
public void setObjectType(String objectType) {
|
||||||
this.objectType = objectType;
|
this.objectType = objectType;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getObjectSubType() {
|
public String getObjectSubType() {
|
||||||
return objectSubType;
|
return objectSubType;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setObjectSubType(String objectSubType) {
|
public void setObjectSubType(String objectSubType) {
|
||||||
this.objectSubType = objectSubType;
|
this.objectSubType = objectSubType;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTitle() {
|
public String getTitle() {
|
||||||
return title;
|
return title;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setTitle(String title) {
|
public void setTitle(String title) {
|
||||||
this.title = title;
|
this.title = title;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<ScholixEntityId> getCreator() {
|
public List<ScholixEntityId> getCreator() {
|
||||||
return creator;
|
return creator;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setCreator(List<ScholixEntityId> creator) {
|
public void setCreator(List<ScholixEntityId> creator) {
|
||||||
this.creator = creator;
|
this.creator = creator;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getPublicationDate() {
|
public String getPublicationDate() {
|
||||||
return publicationDate;
|
return publicationDate;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setPublicationDate(String publicationDate) {
|
public void setPublicationDate(String publicationDate) {
|
||||||
this.publicationDate = publicationDate;
|
this.publicationDate = publicationDate;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<ScholixEntityId> getPublisher() {
|
public List<ScholixEntityId> getPublisher() {
|
||||||
return publisher;
|
return publisher;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setPublisher(List<ScholixEntityId> publisher) {
|
public void setPublisher(List<ScholixEntityId> publisher) {
|
||||||
this.publisher = publisher;
|
this.publisher = publisher;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<ScholixCollectedFrom> getCollectedFrom() {
|
public List<ScholixCollectedFrom> getCollectedFrom() {
|
||||||
return collectedFrom;
|
return collectedFrom;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setCollectedFrom(List<ScholixCollectedFrom> collectedFrom) {
|
public void setCollectedFrom(List<ScholixCollectedFrom> collectedFrom) {
|
||||||
this.collectedFrom = collectedFrom;
|
this.collectedFrom = collectedFrom;
|
||||||
return this;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,6 +11,7 @@ import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
|
||||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
|
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ -24,9 +25,9 @@ public class ScholixSummary implements Serializable {
|
||||||
private String description;
|
private String description;
|
||||||
private List<SchemeValue> subject;
|
private List<SchemeValue> subject;
|
||||||
private List<String> publisher;
|
private List<String> publisher;
|
||||||
private int relatedPublications;
|
private long relatedPublications;
|
||||||
private int relatedDatasets;
|
private long relatedDatasets;
|
||||||
private int relatedUnknown;
|
private long relatedUnknown;
|
||||||
private List<CollectedFromType> datasources;
|
private List<CollectedFromType> datasources;
|
||||||
|
|
||||||
|
|
||||||
|
@ -104,27 +105,27 @@ public class ScholixSummary implements Serializable {
|
||||||
this.publisher = publisher;
|
this.publisher = publisher;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getRelatedPublications() {
|
public long getRelatedPublications() {
|
||||||
return relatedPublications;
|
return relatedPublications;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setRelatedPublications(int relatedPublications) {
|
public void setRelatedPublications(long relatedPublications) {
|
||||||
this.relatedPublications = relatedPublications;
|
this.relatedPublications = relatedPublications;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getRelatedDatasets() {
|
public long getRelatedDatasets() {
|
||||||
return relatedDatasets;
|
return relatedDatasets;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setRelatedDatasets(int relatedDatasets) {
|
public void setRelatedDatasets(long relatedDatasets) {
|
||||||
this.relatedDatasets = relatedDatasets;
|
this.relatedDatasets = relatedDatasets;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getRelatedUnknown() {
|
public long getRelatedUnknown() {
|
||||||
return relatedUnknown;
|
return relatedUnknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setRelatedUnknown(int relatedUnknown) {
|
public void setRelatedUnknown(long relatedUnknown) {
|
||||||
this.relatedUnknown = relatedUnknown;
|
this.relatedUnknown = relatedUnknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -137,6 +138,25 @@ public class ScholixSummary implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static ScholixSummary fromJsonOAF(final Typology oafType, final String oafJson) {
|
||||||
|
try {
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
final RelatedItemInfo relatedItemInfo = new RelatedItemInfo();
|
||||||
|
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
switch (oafType) {
|
||||||
|
case dataset:
|
||||||
|
return summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo);
|
||||||
|
case publication:
|
||||||
|
return summaryFromPublication(mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo);
|
||||||
|
case unknown:
|
||||||
|
return summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo);
|
||||||
|
}
|
||||||
|
} catch (Throwable e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
public static String fromJsonOAF(final Typology oafType, final String oafJson, final String relEntityJson) {
|
public static String fromJsonOAF(final Typology oafType, final String oafJson, final String relEntityJson) {
|
||||||
try {
|
try {
|
||||||
final ObjectMapper mapper = new ObjectMapper();
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
@ -197,7 +217,8 @@ public class ScholixSummary implements Serializable {
|
||||||
.collect(Collectors.toList())
|
.collect(Collectors.toList())
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if (item.getPublisher()!= null)
|
||||||
|
summary.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
|
||||||
|
|
||||||
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
|
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
|
||||||
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
|
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
|
||||||
|
@ -208,12 +229,10 @@ public class ScholixSummary implements Serializable {
|
||||||
.map(
|
.map(
|
||||||
c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())
|
c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())
|
||||||
).collect(Collectors.toList()));
|
).collect(Collectors.toList()));
|
||||||
|
|
||||||
|
|
||||||
return summary;
|
return summary;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static ScholixSummary summaryFromPublication(final DLIPublication item, final RelatedItemInfo relatedItemInfo) {
|
private static ScholixSummary summaryFromPublication(final DLIPublication item, final RelatedItemInfo relatedItemInfo) {
|
||||||
ScholixSummary summary = new ScholixSummary();
|
ScholixSummary summary = new ScholixSummary();
|
||||||
summary.setId(item.getId());
|
summary.setId(item.getId());
|
||||||
|
|
||||||
|
@ -249,6 +268,9 @@ public class ScholixSummary implements Serializable {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (item.getPublisher()!= null)
|
||||||
|
summary.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
|
||||||
|
|
||||||
|
|
||||||
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
|
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
|
||||||
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
|
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
|
||||||
|
@ -264,7 +286,7 @@ public class ScholixSummary implements Serializable {
|
||||||
return summary;
|
return summary;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static ScholixSummary summaryFromUnknown(final DLIUnknown item, final RelatedItemInfo relatedItemInfo) {
|
private static ScholixSummary summaryFromUnknown(final DLIUnknown item, final RelatedItemInfo relatedItemInfo) {
|
||||||
ScholixSummary summary = new ScholixSummary();
|
ScholixSummary summary = new ScholixSummary();
|
||||||
summary.setId(item.getId());
|
summary.setId(item.getId());
|
||||||
if (item.getPid() != null)
|
if (item.getPid() != null)
|
||||||
|
|
|
@ -83,7 +83,25 @@
|
||||||
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
||||||
<arg>--graphPath</arg><arg>${graphPath}</arg>
|
<arg>--graphPath</arg><arg>${graphPath}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="indexSummary"/>
|
<ok to="generateScholix"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="generateScholix">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>generate Scholix</name>
|
||||||
|
<class>eu.dnetlib.dhp.provision.SparkGenerateScholix</class>
|
||||||
|
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>--executor-memory 9G --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
|
||||||
|
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||||
|
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
||||||
|
<arg>--graphPath</arg><arg>${graphPath}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="indexScholix"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -96,36 +114,17 @@
|
||||||
<name>generate Summary</name>
|
<name>generate Summary</name>
|
||||||
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
|
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
|
||||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --num-executors 20 --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} </spark-opts>
|
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="64" </spark-opts>
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||||
<arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
|
<arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
|
||||||
<arg>--index</arg><arg>${index}_object</arg>
|
<arg>--index</arg><arg>${index}_object</arg>
|
||||||
</spark>
|
<arg>--idPath</arg><arg>id</arg>
|
||||||
<ok to="generateScholix"/>
|
<arg>--type</arg><arg>summary</arg>
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<action name="generateScholix">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn-cluster</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>generate Scholix</name>
|
|
||||||
<class>eu.dnetlib.dhp.provision.SparkGenerateScholix</class>
|
|
||||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
|
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
|
||||||
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
|
|
||||||
<arg>--graphPath</arg><arg>${graphPath}</arg>
|
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="indexScholix"/>
|
<ok to="indexScholix"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
|
||||||
<action name="indexScholix">
|
<action name="indexScholix">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
@ -135,15 +134,16 @@
|
||||||
<name>index scholix</name>
|
<name>index scholix</name>
|
||||||
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
|
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
|
||||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --num-executors 20 --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="32" </spark-opts>
|
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="16" </spark-opts>
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||||
<arg>--sourcePath</arg><arg>${workingDirPath}/scholix_index</arg>
|
<arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
|
||||||
<arg>--index</arg><arg>${index}_scholix</arg>
|
<arg>--index</arg><arg>${index}_scholix</arg>
|
||||||
|
<arg>--idPath</arg><arg>identifier</arg>
|
||||||
|
<arg>--type</arg><arg>scholix</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -17,6 +17,13 @@
|
||||||
"paramDescription": "the index name",
|
"paramDescription": "the index name",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"paramName": "t",
|
||||||
|
"paramLongName": "type",
|
||||||
|
"paramDescription": "should be scholix or summary",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "id",
|
"paramName": "id",
|
||||||
"paramLongName": "idPath",
|
"paramLongName": "idPath",
|
||||||
|
|
|
@ -0,0 +1,331 @@
|
||||||
|
{
|
||||||
|
"mappings": {
|
||||||
|
"properties": {
|
||||||
|
"identifier": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"linkprovider": {
|
||||||
|
"type": "nested",
|
||||||
|
"properties": {
|
||||||
|
"identifiers": {
|
||||||
|
"properties": {
|
||||||
|
"identifier": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"schema": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"publicationDate": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"relationship": {
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"schema": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": {
|
||||||
|
"type": "nested",
|
||||||
|
"properties": {
|
||||||
|
"collectedFrom": {
|
||||||
|
"properties": {
|
||||||
|
"completionStatus": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"provider": {
|
||||||
|
"properties": {
|
||||||
|
"identifiers": {
|
||||||
|
"properties": {
|
||||||
|
"identifier": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"schema": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"provisionMode": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"creator": {
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"dnetIdentifier": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"identifier": {
|
||||||
|
"type": "nested",
|
||||||
|
"properties": {
|
||||||
|
"identifier": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"schema": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"objectType": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"publicationDate": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"publisher": {
|
||||||
|
"type": "nested",
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"type": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"title": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"target": {
|
||||||
|
"type": "nested",
|
||||||
|
"properties": {
|
||||||
|
"collectedFrom": {
|
||||||
|
"properties": {
|
||||||
|
"completionStatus": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"provider": {
|
||||||
|
"properties": {
|
||||||
|
"identifiers": {
|
||||||
|
"properties": {
|
||||||
|
"identifier": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"schema": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"provisionMode": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"creator": {
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"dnetIdentifier": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"identifier": {
|
||||||
|
"type": "nested",
|
||||||
|
"properties": {
|
||||||
|
"identifier": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"schema": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"objectType": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"publicationDate": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"publisher": {
|
||||||
|
"type": "nested",
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"type": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"title": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"settings": {
|
||||||
|
"index": {
|
||||||
|
"refresh_interval": "600s",
|
||||||
|
"number_of_shards": "48",
|
||||||
|
"translog": {
|
||||||
|
"sync_interval": "15s",
|
||||||
|
"durability": "ASYNC"
|
||||||
|
},
|
||||||
|
"analysis": {
|
||||||
|
"analyzer": {
|
||||||
|
"analyzer_keyword": {
|
||||||
|
"filter": "lowercase",
|
||||||
|
"tokenizer": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"number_of_replicas": "0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,132 @@
|
||||||
|
{
|
||||||
|
"mappings": {
|
||||||
|
"properties": {
|
||||||
|
"abstract": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"author": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"datasources": {
|
||||||
|
"type": "nested",
|
||||||
|
"properties": {
|
||||||
|
"completionStatus": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"datasourceId": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"datasourceName": {
|
||||||
|
"type": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"date": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"id": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"localIdentifier": {
|
||||||
|
"type": "nested",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"type": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"publisher": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"relatedDatasets": {
|
||||||
|
"type": "long"
|
||||||
|
},
|
||||||
|
"relatedPublications": {
|
||||||
|
"type": "long"
|
||||||
|
},
|
||||||
|
"relatedUnknown": {
|
||||||
|
"type": "long"
|
||||||
|
},
|
||||||
|
"subject": {
|
||||||
|
"properties": {
|
||||||
|
"scheme": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"value": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"title": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"typology": {
|
||||||
|
"type": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"settings": {
|
||||||
|
"index": {
|
||||||
|
"refresh_interval": "600s",
|
||||||
|
"number_of_shards": "48",
|
||||||
|
"translog": {
|
||||||
|
"sync_interval": "15s",
|
||||||
|
"durability": "ASYNC"
|
||||||
|
},
|
||||||
|
"analysis": {
|
||||||
|
"analyzer": {
|
||||||
|
"analyzer_keyword": {
|
||||||
|
"filter": "lowercase",
|
||||||
|
"tokenizer": "keyword"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"number_of_replicas": "0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -12,12 +12,10 @@ import scala.Tuple2;
|
||||||
|
|
||||||
public class ExtractInfoTest {
|
public class ExtractInfoTest {
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test() throws Exception {
|
|
||||||
final String json = IOUtils.toString(getClass().getResourceAsStream("record.json"));
|
|
||||||
ProvisionUtil.getItemType(json,ProvisionUtil.TARGETJSONPATH);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -36,23 +34,20 @@ public class ExtractInfoTest {
|
||||||
public void testScholix() throws Exception {
|
public void testScholix() throws Exception {
|
||||||
final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json"));
|
final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json"));
|
||||||
final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json"));
|
final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json"));
|
||||||
|
|
||||||
Scholix.generateScholixWithSource(jsonSummary, jsonRelation);
|
Scholix.generateScholixWithSource(jsonSummary, jsonRelation);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
|
||||||
public void testIndex() throws Exception {
|
public void testIndex() throws Exception {
|
||||||
SparkIndexCollectionOnES.main(
|
SparkGenerateScholix.main(
|
||||||
|
|
||||||
new String[] {
|
new String[] {
|
||||||
"-mt", "local[*]",
|
"-mt", "local[*]",
|
||||||
"-s", "/home/sandro/dli",
|
"-w", "/Users/sandro/Downloads/scholix/provision",
|
||||||
"-i", "dli_object"
|
"-g", "/Users/sandro/Downloads/scholix/graph"
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue