implemented relation with dataset

This commit is contained in:
Sandro La Bruzzo 2020-03-19 11:11:07 +01:00
parent addaaa091f
commit 0594b92a6d
21 changed files with 847 additions and 254 deletions

View File

@@ -25,8 +25,6 @@ import java.io.IOException;
 public class SparkUpdateEntityJob {
 
     final static String IDJSONPATH = "$.id";
-    final static String SOURCEJSONPATH = "$.source";
-    final static String TARGETJSONPATH = "$.target";
 
     public static void main(String[] args) throws Exception {
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_delete_by_inference_parameters.json")));

View File

@@ -159,6 +159,14 @@ public class SparkScholexplorerMergeEntitiesJob {
                     }
                     , Encoders.bean(Relation.class));
         secondJoin.write().mode(SaveMode.Overwrite).save(targetPath+"_fixed");
+
+        FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration());
+        fileSystem.delete(new Path(targetPath), true);
+        fileSystem.rename(new Path(targetPath+"_fixed"),new Path(targetPath));
     }
 }

View File

@@ -0,0 +1,29 @@
package eu.dnetlib.dhp.provision
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{coalesce, col, count, lit}
object DatasetJoiner {
def startJoin(spark: SparkSession, relPath:String, targetPath:String) {
val relation = spark.read.load(relPath)
  val relatedPublication = relation
    .where("target like '50%'")
    .groupBy("source")
    .agg(count("target").as("publication"))
    .select(col("source").alias("p_source"), col("publication"))
  val relatedDataset = relation
    .where("target like '60%'")
    .groupBy("source")
    .agg(count("target").as("dataset"))
    .select(col("source").alias("d_source"), col("dataset"))
  val relatedUnknown = relation
    .where("target like '70%'")
    .groupBy("source")
    .agg(count("target").as("unknown"))
    .select(col("source").alias("u_source"), col("unknown"))
val firstJoin = relatedPublication
.join(relatedDataset,col("p_source").equalTo(col("d_source")),"full")
.select(coalesce(col("p_source"), col("d_source")).alias("id"),
col("publication"),
col("dataset"))
.join(relatedUnknown, col("u_source").equalTo(col("id")),"full")
.select(coalesce(col("u_source"), col("id")).alias("source"),
coalesce(col("publication"),lit(0)).alias("relatedPublication"),
coalesce(col("dataset"),lit(0)).alias("relatedDataset"),
coalesce(col("unknown"),lit(0)).alias("relatedUnknown")
)
firstJoin.write.mode("overwrite").save(targetPath)
}
}
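Note: the column names written by DatasetJoiner.startJoin (source, relatedPublication, relatedDataset, relatedUnknown) line up with the reworked RelatedItemInfo bean further down, so the counts can be read straight back with a bean encoder. A minimal sketch of such a reader, assuming RelatedItemInfo sits in eu.dnetlib.dhp.provision; the helper class name RelatedItemCountReader is illustrative and not part of this commit:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class RelatedItemCountReader {
    // Reads the output of DatasetJoiner.startJoin back as RelatedItemInfo beans;
    // Encoders.bean maps columns to properties by name, so the names must match.
    public static Dataset<RelatedItemInfo> read(final SparkSession spark, final String workingDirPath) {
        return spark.read()
                .load(workingDirPath + "/relatedItemCount")
                .as(Encoders.bean(RelatedItemInfo.class));
    }
}
```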

View File

@@ -10,21 +10,21 @@ public class ProvisionUtil {
     public final static String TARGETJSONPATH = "$.target";
     public final static String SOURCEJSONPATH = "$.source";
 
-    public static RelatedItemInfo getItemType(final String item, final String idPath) {
-        String targetId = DHPUtils.getJPathString(idPath, item);
-        switch (StringUtils.substringBefore(targetId, "|")) {
-            case "50":
-                return new RelatedItemInfo().setRelatedPublication(1);
-            case "60":
-                return new RelatedItemInfo().setRelatedDataset(1);
-            case "70":
-                return new RelatedItemInfo().setRelatedUnknown(1);
-            default:
-                throw new RuntimeException("Unknonw target ID");
-
-        }
-
-    }
+//    public static RelatedItemInfo getItemType(final String item, final String idPath) {
+//        String targetId = DHPUtils.getJPathString(idPath, item);
+//        switch (StringUtils.substringBefore(targetId, "|")) {
+//            case "50":
+//                return new RelatedItemInfo(null,0,1,0);
+//            case "60":
+//                return new RelatedItemInfo(null,1,0,0);
+//            case "70":
+//                return new RelatedItemInfo(null,0,0,1);
+//            default:
+//                throw new RuntimeException("Unknonw target ID");
+//
+//        }
+//
+//    }
 
     public static Boolean isNotDeleted(final String item) {
         return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item));

View File

@@ -8,57 +8,53 @@ import java.io.Serializable;
 public class RelatedItemInfo implements Serializable {
 
-    private String id;
+    private String source;
-    private int relatedDataset = 0;
+    private long relatedDataset = 0;
-    private int relatedPublication = 0;
+    private long relatedPublication = 0;
-    private int relatedUnknown = 0;
+    private long relatedUnknown = 0;
 
-    public String getId() {
-        return id;
+    public RelatedItemInfo() {
     }
 
-    public RelatedItemInfo setId(String id) {
-        this.id = id;
-        return this;
+    public RelatedItemInfo(String source, long relatedDataset, long relatedPublication, long relatedUnknown) {
+        this.source = source;
+        this.relatedDataset = relatedDataset;
+        this.relatedPublication = relatedPublication;
+        this.relatedUnknown = relatedUnknown;
     }
 
-    public RelatedItemInfo add(RelatedItemInfo other) {
-        if (other != null) {
-            relatedDataset += other.getRelatedDataset();
-            relatedPublication += other.getRelatedPublication();
-            relatedUnknown += other.getRelatedUnknown();
-        }
-        return this;
+    public String getSource() {
+        return source;
     }
 
-    public int getRelatedDataset() {
+    public void setSource(String source) {
+        this.source = source;
+    }
+
+    public long getRelatedDataset() {
         return relatedDataset;
     }
 
-    public RelatedItemInfo setRelatedDataset(int relatedDataset) {
+    public void setRelatedDataset(long relatedDataset) {
         this.relatedDataset = relatedDataset;
-        return this;
     }
 
-    public int getRelatedPublication() {
+    public long getRelatedPublication() {
         return relatedPublication;
     }
 
-    public RelatedItemInfo setRelatedPublication(int relatedPublication) {
+    public void setRelatedPublication(long relatedPublication) {
         this.relatedPublication = relatedPublication;
-        return this;
     }
 
-    public int getRelatedUnknown() {
+    public long getRelatedUnknown() {
         return relatedUnknown;
     }
 
-    public RelatedItemInfo setRelatedUnknown(int relatedUnknown) {
+    public void setRelatedUnknown(int relatedUnknown) {
         this.relatedUnknown = relatedUnknown;
-        return this;
     }
 }

View File

@@ -1,19 +1,22 @@
 package eu.dnetlib.dhp.provision;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.jayway.jsonpath.JsonPath;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.DHPUtils;
+import net.minidev.json.JSONArray;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function2;
 import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.catalyst.expressions.Expression;
 import scala.Tuple2;
 
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
 
 /**
  * SparkExtractRelationCount is a spark job that takes in input relation RDD
@@ -42,27 +45,34 @@ public class SparkExtractRelationCount {
 
         final String relationPath = parser.get("relationPath");
-        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        sc.textFile(relationPath)
-                // We start to Filter the relation not deleted by Inference
-                .filter(ProvisionUtil::isNotDeleted)
-                // Then we create a PairRDD<String, RelatedItem>
-                .mapToPair((PairFunction<String, String, RelatedItemInfo>) f
-                        -> new Tuple2<>(DHPUtils.getJPathString(ProvisionUtil.SOURCEJSONPATH, f), ProvisionUtil.getItemType(f, ProvisionUtil.TARGETJSONPATH)))
-                //We reduce and sum the number of Relations
-                .reduceByKey((Function2<RelatedItemInfo, RelatedItemInfo, RelatedItemInfo>) (v1, v2) -> {
-                    if (v1 == null && v2 == null)
-                        return new RelatedItemInfo();
-                    return v1 != null ? v1.add(v2) : v2;
-                })
-                //Set the source Id in RelatedItem object
-                .map(k -> k._2().setId(k._1()))
-                // Convert to JSON and save as TextFile
-                .map(k -> {
-                    ObjectMapper mapper = new ObjectMapper();
-                    return mapper.writeValueAsString(k);
-                }).saveAsTextFile(workingDirPath + "/relatedItemCount", GzipCodec.class);
+
+        DatasetJoiner.startJoin(spark, relationPath,workingDirPath + "/relatedItemCount");
+
+//        sc.textFile(relationPath)
+//                // We start to Filter the relation not deleted by Inference
+//                .filter(ProvisionUtil::isNotDeleted)
+//                // Then we create a PairRDD<String, RelatedItem>
+//                .mapToPair((PairFunction<String, String, RelatedItemInfo>) f
+//                        -> new Tuple2<>(DHPUtils.getJPathString(ProvisionUtil.SOURCEJSONPATH, f), ProvisionUtil.getItemType(f, ProvisionUtil.TARGETJSONPATH)))
+//                //We reduce and sum the number of Relations
+//                .reduceByKey((Function2<RelatedItemInfo, RelatedItemInfo, RelatedItemInfo>) (v1, v2) -> {
+//                    if (v1 == null && v2 == null)
+//                        return new RelatedItemInfo();
+//                    return v1 != null ? v1.add(v2) : v2;
+//                })
+//                //Set the source Id in RelatedItem object
+//                .map(k -> k._2().setId(k._1()))
+//                // Convert to JSON and save as TextFile
+//                .map(k -> {
+//                    ObjectMapper mapper = new ObjectMapper();
+//                    return mapper.writeValueAsString(k);
+//                }).saveAsTextFile(workingDirPath + "/relatedItemCount", GzipCodec.class);
     }

View File

@@ -1,16 +1,22 @@
 package eu.dnetlib.dhp.provision;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.provision.scholix.Scholix;
-import eu.dnetlib.dhp.provision.scholix.ScholixResource;
-import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
+import eu.dnetlib.dhp.provision.scholix.*;
+import eu.dnetlib.dhp.provision.scholix.summary.*;
+import eu.dnetlib.dhp.schema.oaf.Relation;
 import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
-import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.*;
+import static org.apache.spark.sql.functions.col;
+import scala.Int;
 import scala.Tuple2;
 
 import java.util.ArrayList;
@@ -19,19 +25,34 @@ import java.util.Random;
 public class SparkGenerateScholix {
 
-    private static final String jsonIDPath = "$.id";
-    private static final String sourceIDPath = "$.source";
-    private static final String targetIDPath = "$.target";
 
     public static void main(String[] args) throws Exception {
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholix.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")));
         parser.parseArgument(args);
+        SparkConf conf = new SparkConf();
+        conf.set("spark.sql.shuffle.partitions","4000");
+//        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
+//        conf.registerKryoClasses(new Class[]{
+//                ScholixSummary.class,
+//                CollectedFromType.class,
+//                SchemeValue.class,
+//                TypedIdentifier.class,
+//                Typology.class,
+//                Relation.class,
+//                Scholix.class,
+//                ScholixCollectedFrom.class,
+//                ScholixEntityId.class,
+//                ScholixIdentifier.class,
+//                ScholixRelationship.class,
+//                ScholixResource.class
+//        });
         final SparkSession spark = SparkSession
                 .builder()
+                .config(conf)
                 .appName(SparkExtractRelationCount.class.getSimpleName())
                 .master(parser.get("master"))
                 .getOrCreate();
@@ -42,51 +63,30 @@ public class SparkGenerateScholix {
 
         final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 
+        final Dataset<ScholixSummary> scholixSummary = spark.read().load(workingDirPath + "/summary").as(Encoders.bean(ScholixSummary.class));
+        final Dataset<Relation> rels = spark.read().load(graphPath + "/relation").as(Encoders.bean(Relation.class));
 
-//        final JavaRDD<String> relationToExport = sc.textFile(graphPath + "/relation").filter(ProvisionUtil::isNotDeleted).repartition(4000);
-        final JavaPairRDD<String,ScholixResource> scholixSummary =
-                sc.textFile(workingDirPath + "/summary")
-                        .flatMapToPair((PairFlatMapFunction<String, String, ScholixResource>) i -> {
-                            final ObjectMapper mapper = new ObjectMapper();
-                            final ScholixSummary summary = mapper.readValue(i, ScholixSummary.class);
-                            ScholixResource tmp = ScholixResource.fromSummary(summary);
-                            final List<Tuple2<String, ScholixResource>> result = new ArrayList<>();
-                            for (int k = 0; k<10; k++)
-                                result.add(new Tuple2<>(String.format("%s::%d", tmp.getDnetIdentifier(), k), tmp));
-                            return result.iterator();
-                        });
-//        scholixSummary.join(
-//                relationToExport
-//                        .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(sourceIDPath, i), i)))
-//                .map(Tuple2::_2)
-//                .mapToPair(summaryRelation ->
-//                        new Tuple2<>(
-//                                DHPUtils.getJPathString(targetIDPath, summaryRelation._2()),
-//                                Scholix.generateScholixWithSource(summaryRelation._1(), summaryRelation._2())))
-//
-//                .map(t-> t._2().setTarget(new ScholixResource().setDnetIdentifier(t._1())))
-//                .map(s-> {
-//                    ObjectMapper mapper = new ObjectMapper();
-//                    return mapper.writeValueAsString(s);
-//                })
-//                .saveAsTextFile(workingDirPath + "/scholix", GzipCodec.class);
+        Dataset<Scholix> firstJoin = scholixSummary.joinWith(rels, scholixSummary.col("id").equalTo(rels.col("source")))
+                .map((MapFunction<Tuple2<ScholixSummary, Relation>, Scholix>) f -> Scholix.generateScholixWithSource(f._1(), f._2()), Encoders.bean(Scholix.class));
 
-        sc.textFile(workingDirPath + "/scholix")
-                .mapToPair(t -> {
-                    ObjectMapper mapper = new ObjectMapper();
-                    Scholix scholix = mapper.readValue(t, Scholix.class);
-                    Random rand = new Random();
-                    return new Tuple2<>(String.format("%s::%d",scholix.getTarget().getDnetIdentifier(), rand.nextInt(10)), scholix);
-                })
-                .join(scholixSummary)
-                .map(t-> {
-                    Scholix item = t._2()._1().setTarget(t._2()._2());
-                    item.generateIdentifier();
-                    return item;
-                })
-                .map(s-> new ObjectMapper().writeValueAsString(s)).saveAsTextFile(workingDirPath + "/scholix_index", GzipCodec.class);
+        firstJoin.write().mode(SaveMode.Overwrite).save(workingDirPath+"/scholix_1");
+        firstJoin = spark.read().load(workingDirPath+"/scholix_1").as(Encoders.bean(Scholix.class));
+
+        Dataset<Scholix> scholix_final = spark.read().load(workingDirPath+"/scholix_1").as(Encoders.bean(Scholix.class));
+
+        Dataset<ScholixResource> target = spark.read().load(workingDirPath+"/scholix_target").as(Encoders.bean(ScholixResource.class));
+
+        scholix_final.joinWith(target, scholix_final.col("identifier").equalTo(target.col("dnetIdentifier")), "inner")
+                .map((MapFunction<Tuple2<Scholix, ScholixResource>, Scholix>) f -> {
+                    final Scholix scholix = f._1();
+                    final ScholixResource scholixTarget = f._2();
+                    scholix.setTarget(scholixTarget);
+                    scholix.generateIdentifier();
+                    scholix.generatelinkPublisher();
+                    return scholix;
+                }, Encoders.bean(Scholix.class)).repartition(5000).write().mode(SaveMode.Overwrite).save(workingDirPath+"/scholix_index");
     }
 }
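Note: SparkGenerateScholix now writes parquet to ${workingDirPath}/scholix_index, while the workflow further down feeds ${workingDirPath}/scholix_json to the indexer; the step bridging the two is not part of this commit. A hedged sketch of what such a conversion could look like, reusing only the path names seen above (class name and everything else are assumptions):

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class ScholixToJson {
    // Hypothetical bridge: re-serializes the Scholix parquet as JSON lines,
    // which is the shape SparkIndexCollectionOnES pushes to Elasticsearch.
    public static void run(final SparkSession spark, final String workingDirPath) {
        final Dataset<Scholix> scholix = spark.read()
                .load(workingDirPath + "/scholix_index")
                .as(Encoders.bean(Scholix.class));
        scholix.map((MapFunction<Scholix, String>) s -> new ObjectMapper().writeValueAsString(s), Encoders.STRING())
                .write().mode(SaveMode.Overwrite)
                .text(workingDirPath + "/scholix_json");
    }
}
```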

View File

@@ -1,14 +1,19 @@
 package eu.dnetlib.dhp.provision;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
 import eu.dnetlib.dhp.utils.DHPUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import scala.Tuple2;
 
@@ -31,27 +36,53 @@ public class SparkGenerateSummary {
         final String workingDirPath = parser.get("workingDirPath");
 
         final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-        JavaPairRDD<String, String> relationCount = sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i));
 
-        JavaPairRDD<String, String> entities =
-                sc.textFile(graphPath + "/publication")
-                        .filter(ProvisionUtil::isNotDeleted)
-                        .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
-                        .union(
-                                sc.textFile(graphPath + "/dataset")
-                                        .filter(ProvisionUtil::isNotDeleted)
-                                        .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
-                        )
-                        .union(
-                                sc.textFile(graphPath + "/unknown")
-                                        .filter(ProvisionUtil::isNotDeleted)
-                                        .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
-                        );
-        entities.join(relationCount).map((Function<Tuple2<String, Tuple2<String, String>>, String>) k ->
-                ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class);
-        ;
+        Dataset<RelatedItemInfo> rInfo = spark.read().load(workingDirPath + "/relatedItemCount").as(Encoders.bean(RelatedItemInfo.class));
+
+        Dataset<ScholixSummary> entity = spark.createDataset(sc.textFile(graphPath + "/publication," + graphPath + "/dataset," + graphPath + "/unknown")
+                .map(s ->
+                        ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(DHPUtils.getJPathString(jsonIDPath, s)), s)
+                ).rdd(), Encoders.bean(ScholixSummary.class));
+
+        Dataset<ScholixSummary> summaryComplete = rInfo.joinWith(entity, rInfo.col("source").equalTo(entity.col("id"))).map((MapFunction<Tuple2<RelatedItemInfo, ScholixSummary>, ScholixSummary>) t ->
+                {
+                    ScholixSummary scholixSummary = t._2();
+                    RelatedItemInfo relatedItemInfo = t._1();
+                    scholixSummary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
+                    scholixSummary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
+                    scholixSummary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
+                    return scholixSummary;
+                }, Encoders.bean(ScholixSummary.class)
+        );
+
+        summaryComplete.write().save(workingDirPath+"/summary");
+
+//        JavaPairRDD<String, String> relationCount = sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i));
+//
+//        JavaPairRDD<String, String> entities =
+//                sc.textFile(graphPath + "/publication")
+//                        .filter(ProvisionUtil::isNotDeleted)
+//                        .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
+//                        .union(
+//                                sc.textFile(graphPath + "/dataset")
+//                                        .filter(ProvisionUtil::isNotDeleted)
+//                                        .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
+//                        )
+//                        .union(
+//                                sc.textFile(graphPath + "/unknown")
+//                                        .filter(ProvisionUtil::isNotDeleted)
+//                                        .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
+//                        );
+//        entities.join(relationCount).map((Function<Tuple2<String, Tuple2<String, String>>, String>) k ->
+//                ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class);
+//
+//
+//        ;
     }
 }

View File

@@ -1,13 +1,20 @@
 package eu.dnetlib.dhp.provision;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.provision.scholix.Scholix;
+import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
 
+import java.nio.file.attribute.AclFileAttributeView;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -21,17 +28,30 @@ public class SparkIndexCollectionOnES {
 
         SparkConf conf = new SparkConf().setAppName(SparkIndexCollectionOnES.class.getSimpleName())
                 .setMaster(parser.get("master"));
 
+        conf.set("spark.sql.shuffle.partitions","4000");
+
         final String sourcePath = parser.get("sourcePath");
         final String index = parser.get("index");
         final String idPath = parser.get("idPath");
+        final String type = parser.get("type");
 
         final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
 
         final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 
-        JavaRDD<String> inputRdd = sc.textFile(sourcePath);
+        JavaRDD<String> inputRdd;
+
+        if("summary".equalsIgnoreCase(type))
+            inputRdd = spark.read().load(sourcePath).as(Encoders.bean(ScholixSummary.class)).map((MapFunction<ScholixSummary, String>) f -> {
+                final ObjectMapper mapper = new ObjectMapper();
+                return mapper.writeValueAsString(f);
+            }, Encoders.STRING()).javaRDD();
+        else
+            inputRdd = sc.textFile(sourcePath);
 
         Map<String, String> esCfg = new HashMap<>();
         esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
@@ -40,8 +60,6 @@ public class SparkIndexCollectionOnES {
         esCfg.put("es.batch.write.retry.wait", "60s");
         esCfg.put("es.batch.size.entries", "200");
         esCfg.put("es.nodes.wan.only", "true");
-
-
         JavaEsSpark.saveJsonToEs(inputRdd,index, esCfg);
     }
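Note: with the new --type and --idPath parameters the same indexing job is invoked twice from the workflow below, once per index. A hypothetical local invocation for the summary index, following the short argument names from the old test and the parameter JSON further down; paths and index name are placeholders:

```java
public class IndexSummaryLocally {
    public static void main(String[] args) throws Exception {
        // Placeholder paths/index; a reachable Elasticsearch cluster is required,
        // since the ES nodes are hard-coded in SparkIndexCollectionOnES.
        SparkIndexCollectionOnES.main(new String[]{
                "-mt", "local[*]",
                "-s", "/tmp/provision/summary",  // sourcePath (placeholder)
                "-i", "dli_object",              // index name (placeholder)
                "-id", "id",                     // idPath, per the parameter JSON
                "-t", "summary"                  // new type switch: summary or scholix
        });
    }
}
```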

View File

@@ -5,8 +5,7 @@ import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.DHPUtils;
 import java.io.Serializable;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
 import java.util.stream.Collectors;
 
 public class Scholix implements Serializable {
@@ -25,6 +24,20 @@ public class Scholix implements Serializable {
     private String identifier;
 
+    public Scholix clone(final ScholixResource t) {
+        final Scholix clone = new Scholix();
+        clone.setPublicationDate(publicationDate);
+        clone.setPublisher(publisher);
+        clone.setLinkprovider(linkprovider);
+        clone.setRelationship(relationship);
+        clone.setSource(source);
+        clone.setTarget(t);
+        clone.generatelinkPublisher();
+        clone.generateIdentifier();
+        return clone;
+    }
+
     public static Scholix generateScholixWithSource(final String sourceSummaryJson, final String relation) {
         final ObjectMapper mapper = new ObjectMapper();
@@ -46,6 +59,34 @@ public class Scholix implements Serializable {
         }
     }
 
+    public static Scholix generateScholixWithSource(final ScholixSummary scholixSummary, final Relation rel) {
+        final Scholix s = new Scholix();
+        if (scholixSummary.getDate() != null && scholixSummary.getDate().size()>0)
+            s.setPublicationDate(scholixSummary.getDate().get(0));
+        s.setLinkprovider(rel.getCollectedFrom().stream().map(cf ->
+                new ScholixEntityId(cf.getValue(), Collections.singletonList(
+                        new ScholixIdentifier(cf.getKey(), "dnet_identifier")
+                ))).collect(Collectors.toList()));
+        s.setRelationship(new ScholixRelationship(rel.getRelType(),rel.getRelClass(),null ));
+        s.setSource(ScholixResource.fromSummary(scholixSummary));
+
+        s.setIdentifier(rel.getTarget());
+//        ScholixResource mockTarget = new ScholixResource();
+//        mockTarget.setDnetIdentifier(rel.getTarget());
+//        s.setTarget(mockTarget);
+//        s.generateIdentifier();
+        return s;
+    }
+
+    public void generatelinkPublisher() {
+        Set<String> publisher = new HashSet<>();
+        if (source.getPublisher() != null)
+            publisher.addAll(source.getPublisher().stream().map(ScholixEntityId::getName).collect(Collectors.toList()));
+        if (target.getPublisher() != null)
+            publisher.addAll(target.getPublisher().stream().map(ScholixEntityId::getName).collect(Collectors.toList()));
+        this.publisher = publisher.stream().map(k -> new ScholixEntityId(k ,null)).collect(Collectors.toList());
+    }
+
     public void generateIdentifier( ) {
         setIdentifier(DHPUtils.md5(String.format("%s::%s::%s",source.getDnetIdentifier(),relationship.getName(), target.getDnetIdentifier())));
@@ -65,67 +106,58 @@ public class Scholix implements Serializable {
         }
     }
 
     public String getPublicationDate() {
         return publicationDate;
     }
 
-    public Scholix setPublicationDate(String publicationDate) {
+    public void setPublicationDate(String publicationDate) {
         this.publicationDate = publicationDate;
-        return this;
     }
 
     public List<ScholixEntityId> getPublisher() {
         return publisher;
     }
 
-    public Scholix setPublisher(List<ScholixEntityId> publisher) {
+    public void setPublisher(List<ScholixEntityId> publisher) {
         this.publisher = publisher;
-        return this;
     }
 
     public List<ScholixEntityId> getLinkprovider() {
         return linkprovider;
     }
 
-    public Scholix setLinkprovider(List<ScholixEntityId> linkprovider) {
+    public void setLinkprovider(List<ScholixEntityId> linkprovider) {
        this.linkprovider = linkprovider;
-        return this;
     }
 
     public ScholixRelationship getRelationship() {
         return relationship;
     }
 
-    public Scholix setRelationship(ScholixRelationship relationship) {
+    public void setRelationship(ScholixRelationship relationship) {
         this.relationship = relationship;
-        return this;
     }
 
     public ScholixResource getSource() {
         return source;
     }
 
-    public Scholix setSource(ScholixResource source) {
+    public void setSource(ScholixResource source) {
         this.source = source;
-        return this;
     }
 
     public ScholixResource getTarget() {
         return target;
     }
 
-    public Scholix setTarget(ScholixResource target) {
+    public void setTarget(ScholixResource target) {
         this.target = target;
-        return this;
     }
 
     public String getIdentifier() {
         return identifier;
     }
 
-    public Scholix setIdentifier(String identifier) {
+    public void setIdentifier(String identifier) {
         this.identifier = identifier;
-        return this;
     }
 }
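Note: the new clone(ScholixResource) helper is not exercised elsewhere in this commit. A hedged sketch of the kind of fan-out it enables (the helper class name is illustrative, not part of the repository):

```java
import java.util.List;
import java.util.stream.Collectors;

public class ScholixCloneExample {
    // Expands one source-side Scholix into one record per resolved target,
    // relying on clone(t) to recompute the publisher links and the identifier.
    public static List<Scholix> expand(final Scholix sourceSide, final List<ScholixResource> targets) {
        return targets.stream()
                .map(sourceSide::clone)
                .collect(Collectors.toList());
    }
}
```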

View File

@@ -21,26 +21,23 @@ public class ScholixCollectedFrom implements Serializable {
         return provider;
     }
 
-    public ScholixCollectedFrom setProvider(ScholixEntityId provider) {
+    public void setProvider(ScholixEntityId provider) {
         this.provider = provider;
-        return this;
     }
 
     public String getProvisionMode() {
         return provisionMode;
     }
 
-    public ScholixCollectedFrom setProvisionMode(String provisionMode) {
+    public void setProvisionMode(String provisionMode) {
         this.provisionMode = provisionMode;
-        return this;
     }
 
     public String getCompletionStatus() {
         return completionStatus;
     }
 
-    public ScholixCollectedFrom setCompletionStatus(String completionStatus) {
+    public void setCompletionStatus(String completionStatus) {
         this.completionStatus = completionStatus;
-        return this;
     }
 }

View File

@@ -19,17 +19,15 @@ public class ScholixEntityId implements Serializable {
         return name;
     }
 
-    public ScholixEntityId setName(String name) {
+    public void setName(String name) {
         this.name = name;
-        return this;
     }
 
     public List<ScholixIdentifier> getIdentifiers() {
         return identifiers;
     }
 
-    public ScholixEntityId setIdentifiers(List<ScholixIdentifier> identifiers) {
+    public void setIdentifiers(List<ScholixIdentifier> identifiers) {
         this.identifiers = identifiers;
-        return this;
     }
 }

View File

@@ -18,17 +18,15 @@ public class ScholixIdentifier implements Serializable {
         return identifier;
     }
 
-    public ScholixIdentifier setIdentifier(String identifier) {
+    public void setIdentifier(String identifier) {
         this.identifier = identifier;
-        return this;
     }
 
     public String getSchema() {
         return schema;
     }
 
-    public ScholixIdentifier setSchema(String schema) {
+    public void setSchema(String schema) {
         this.schema = schema;
-        return this;
     }
 }

View File

@@ -20,26 +20,23 @@ public class ScholixRelationship implements Serializable {
         return name;
     }
 
-    public ScholixRelationship setName(String name) {
+    public void setName(String name) {
         this.name = name;
-        return this;
     }
 
     public String getSchema() {
         return schema;
     }
 
-    public ScholixRelationship setSchema(String schema) {
+    public void setSchema(String schema) {
         this.schema = schema;
-        return this;
     }
 
     public String getInverse() {
         return inverse;
     }
 
-    public ScholixRelationship setInverse(String inverse) {
+    public void setInverse(String inverse) {
         this.inverse = inverse;
-        return this;
     }
 }

View File

@@ -21,6 +21,9 @@ public class ScholixResource implements Serializable {
     private List<ScholixCollectedFrom> collectedFrom;
 
+
+
+
     public static ScholixResource fromSummary(ScholixSummary summary) {
         final ScholixResource resource = new ScholixResource();
@@ -66,80 +69,71 @@ public class ScholixResource implements Serializable {
         return identifier;
     }
 
-    public ScholixResource setIdentifier(List<ScholixIdentifier> identifier) {
+    public void setIdentifier(List<ScholixIdentifier> identifier) {
         this.identifier = identifier;
-        return this;
     }
 
     public String getDnetIdentifier() {
         return dnetIdentifier;
     }
 
-    public ScholixResource setDnetIdentifier(String dnetIdentifier) {
+    public void setDnetIdentifier(String dnetIdentifier) {
         this.dnetIdentifier = dnetIdentifier;
-        return this;
     }
 
     public String getObjectType() {
         return objectType;
     }
 
-    public ScholixResource setObjectType(String objectType) {
+    public void setObjectType(String objectType) {
         this.objectType = objectType;
-        return this;
     }
 
     public String getObjectSubType() {
         return objectSubType;
     }
 
-    public ScholixResource setObjectSubType(String objectSubType) {
+    public void setObjectSubType(String objectSubType) {
         this.objectSubType = objectSubType;
-        return this;
     }
 
     public String getTitle() {
         return title;
     }
 
-    public ScholixResource setTitle(String title) {
+    public void setTitle(String title) {
         this.title = title;
-        return this;
     }
 
     public List<ScholixEntityId> getCreator() {
         return creator;
     }
 
-    public ScholixResource setCreator(List<ScholixEntityId> creator) {
+    public void setCreator(List<ScholixEntityId> creator) {
         this.creator = creator;
-        return this;
     }
 
     public String getPublicationDate() {
         return publicationDate;
     }
 
-    public ScholixResource setPublicationDate(String publicationDate) {
+    public void setPublicationDate(String publicationDate) {
         this.publicationDate = publicationDate;
-        return this;
     }
 
     public List<ScholixEntityId> getPublisher() {
         return publisher;
     }
 
-    public ScholixResource setPublisher(List<ScholixEntityId> publisher) {
+    public void setPublisher(List<ScholixEntityId> publisher) {
         this.publisher = publisher;
-        return this;
     }
 
     public List<ScholixCollectedFrom> getCollectedFrom() {
         return collectedFrom;
     }
 
-    public ScholixResource setCollectedFrom(List<ScholixCollectedFrom> collectedFrom) {
+    public void setCollectedFrom(List<ScholixCollectedFrom> collectedFrom) {
         this.collectedFrom = collectedFrom;
-        return this;
     }
 }

View File

@@ -11,6 +11,7 @@ import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
 import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
 
 import java.io.Serializable;
+import java.util.Collections;
 import java.util.List;
 import java.util.stream.Collectors;
 
@@ -24,9 +25,9 @@ public class ScholixSummary implements Serializable {
     private String description;
     private List<SchemeValue> subject;
     private List<String> publisher;
-    private int relatedPublications;
-    private int relatedDatasets;
-    private int relatedUnknown;
+    private long relatedPublications;
+    private long relatedDatasets;
+    private long relatedUnknown;
     private List<CollectedFromType> datasources;
 
@@ -104,27 +105,27 @@ public class ScholixSummary implements Serializable {
         this.publisher = publisher;
     }
 
-    public int getRelatedPublications() {
+    public long getRelatedPublications() {
         return relatedPublications;
     }
 
-    public void setRelatedPublications(int relatedPublications) {
+    public void setRelatedPublications(long relatedPublications) {
         this.relatedPublications = relatedPublications;
     }
 
-    public int getRelatedDatasets() {
+    public long getRelatedDatasets() {
         return relatedDatasets;
     }
 
-    public void setRelatedDatasets(int relatedDatasets) {
+    public void setRelatedDatasets(long relatedDatasets) {
         this.relatedDatasets = relatedDatasets;
     }
 
-    public int getRelatedUnknown() {
+    public long getRelatedUnknown() {
         return relatedUnknown;
     }
 
-    public void setRelatedUnknown(int relatedUnknown) {
+    public void setRelatedUnknown(long relatedUnknown) {
         this.relatedUnknown = relatedUnknown;
     }
 
@@ -137,6 +138,25 @@ public class ScholixSummary implements Serializable {
     }
 
+    public static ScholixSummary fromJsonOAF(final Typology oafType, final String oafJson) {
+        try {
+            final ObjectMapper mapper = new ObjectMapper();
+            final RelatedItemInfo relatedItemInfo = new RelatedItemInfo();
+            mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+            switch (oafType) {
+                case dataset:
+                    return summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo);
+                case publication:
+                    return summaryFromPublication(mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo);
+                case unknown:
+                    return summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo);
+            }
+        } catch (Throwable e) {
+            throw new RuntimeException(e);
+        }
+        return null;
+    }
+
     public static String fromJsonOAF(final Typology oafType, final String oafJson, final String relEntityJson) {
         try {
             final ObjectMapper mapper = new ObjectMapper();
@@ -197,7 +217,8 @@ public class ScholixSummary implements Serializable {
                             .collect(Collectors.toList())
             );
         }
-
+        if (item.getPublisher()!= null)
+            summary.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
 
         summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
         summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
@@ -208,8 +229,6 @@ public class ScholixSummary implements Serializable {
                 .map(
                         c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())
                 ).collect(Collectors.toList()));
-
-
         return summary;
     }
 
@@ -249,6 +268,9 @@ public class ScholixSummary implements Serializable {
             );
         }
 
+        if (item.getPublisher()!= null)
+            summary.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
+
         summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
         summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());

View File

@@ -83,7 +83,25 @@
             <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
             <arg>--graphPath</arg><arg>${graphPath}</arg>
         </spark>
-        <ok to="indexSummary"/>
+        <ok to="generateScholix"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="generateScholix">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>generate Scholix</name>
+            <class>eu.dnetlib.dhp.provision.SparkGenerateScholix</class>
+            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <spark-opts>--executor-memory 9G --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
+            <arg>-mt</arg> <arg>yarn-cluster</arg>
+            <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
+            <arg>--graphPath</arg><arg>${graphPath}</arg>
+        </spark>
+        <ok to="indexScholix"/>
         <error to="Kill"/>
     </action>
 
@@ -96,36 +114,17 @@
             <name>generate Summary</name>
            <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
             <jar>dhp-graph-provision-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --num-executors 20 --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} </spark-opts>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="64" </spark-opts>
             <arg>-mt</arg> <arg>yarn-cluster</arg>
             <arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
             <arg>--index</arg><arg>${index}_object</arg>
+            <arg>--idPath</arg><arg>id</arg>
+            <arg>--type</arg><arg>summary</arg>
         </spark>
-        <ok to="generateScholix"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="generateScholix">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>generate Scholix</name>
-            <class>eu.dnetlib.dhp.provision.SparkGenerateScholix</class>
-            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
-            <arg>--graphPath</arg><arg>${graphPath}</arg>
-        </spark>
         <ok to="indexScholix"/>
         <error to="Kill"/>
     </action>
 
     <action name="indexScholix">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <job-tracker>${jobTracker}</job-tracker>
@@ -135,15 +134,16 @@
             <name>index scholix</name>
             <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
             <jar>dhp-graph-provision-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --num-executors 20 --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="32" </spark-opts>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="16" </spark-opts>
             <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${workingDirPath}/scholix_index</arg>
+            <arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
             <arg>--index</arg><arg>${index}_scholix</arg>
+            <arg>--idPath</arg><arg>identifier</arg>
+            <arg>--type</arg><arg>scholix</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>
     </action>
     <end name="End"/>
 </workflow-app>

View File

@@ -17,6 +17,13 @@
     "paramDescription": "the index name",
     "paramRequired": true
   },
+  {
+    "paramName": "t",
+    "paramLongName": "type",
+    "paramDescription": "should be scholix or summary",
+    "paramRequired": true
+  },
   {
     "paramName": "id",
     "paramLongName": "idPath",

View File

@@ -0,0 +1,331 @@
{
"mappings": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"linkprovider": {
"type": "nested",
"properties": {
"identifiers": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"name": {
"type": "keyword"
}
}
},
"publicationDate": {
"type": "keyword"
},
"relationship": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"source": {
"type": "nested",
"properties": {
"collectedFrom": {
"properties": {
"completionStatus": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"provider": {
"properties": {
"identifiers": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"provisionMode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"creator": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"dnetIdentifier": {
"type": "keyword"
},
"identifier": {
"type": "nested",
"properties": {
"identifier": {
"type": "keyword"
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"type": {
"type": "keyword"
}
}
},
"objectType": {
"type": "keyword"
},
"publicationDate": {
"type": "keyword"
},
"publisher": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"target": {
"type": "nested",
"properties": {
"collectedFrom": {
"properties": {
"completionStatus": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"provider": {
"properties": {
"identifiers": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"provisionMode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"creator": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"dnetIdentifier": {
"type": "keyword"
},
"identifier": {
"type": "nested",
"properties": {
"identifier": {
"type": "keyword"
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"type": {
"type": "keyword"
}
}
},
"objectType": {
"type": "keyword"
},
"publicationDate": {
"type": "keyword"
},
"publisher": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
},
"settings": {
"index": {
"refresh_interval": "600s",
"number_of_shards": "48",
"translog": {
"sync_interval": "15s",
"durability": "ASYNC"
},
"analysis": {
"analyzer": {
"analyzer_keyword": {
"filter": "lowercase",
"tokenizer": "keyword"
}
}
},
"number_of_replicas": "0"
}
}
}

View File

@@ -0,0 +1,132 @@
{
"mappings": {
"properties": {
"abstract": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"author": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"datasources": {
"type": "nested",
"properties": {
"completionStatus": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"datasourceId": {
"type": "keyword"
},
"datasourceName": {
"type": "keyword"
}
}
},
"date": {
"type": "keyword"
},
"id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"localIdentifier": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"type": {
"type": "keyword"
}
}
},
"publisher": {
"type": "keyword"
},
"relatedDatasets": {
"type": "long"
},
"relatedPublications": {
"type": "long"
},
"relatedUnknown": {
"type": "long"
},
"subject": {
"properties": {
"scheme": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"typology": {
"type": "keyword"
}
}
},
"settings": {
"index": {
"refresh_interval": "600s",
"number_of_shards": "48",
"translog": {
"sync_interval": "15s",
"durability": "ASYNC"
},
"analysis": {
"analyzer": {
"analyzer_keyword": {
"filter": "lowercase",
"tokenizer": "keyword"
}
}
},
"number_of_replicas": "0"
}
}
}

View File

@@ -12,12 +12,10 @@ import scala.Tuple2;
 public class ExtractInfoTest {
 
-    @Test
-    public void test() throws Exception {
-        final String json = IOUtils.toString(getClass().getResourceAsStream("record.json"));
-        ProvisionUtil.getItemType(json,ProvisionUtil.TARGETJSONPATH);
-    }
 
     @Test
@@ -36,23 +34,20 @@ public class ExtractInfoTest {
     public void testScholix() throws Exception {
         final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json"));
         final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json"));
         Scholix.generateScholixWithSource(jsonSummary, jsonRelation);
     }
 
     @Test
-    @Ignore
     public void testIndex() throws Exception {
-        SparkIndexCollectionOnES.main(
+        SparkGenerateScholix.main(
                 new String[] {
                         "-mt", "local[*]",
-                        "-s", "/home/sandro/dli",
-                        "-i", "dli_object"
+                        "-w", "/Users/sandro/Downloads/scholix/provision",
+                        "-g", "/Users/sandro/Downloads/scholix/graph"
                 }
         );
     }