forked from D-Net/dnet-hadoop
parent a0b32c4184
commit 5cfaac0790
@@ -50,7 +50,7 @@ public class ModelSupport {
     */
    public static final Map<String, Class> resultTypes = Maps.newHashMap();

    static {
        resultTypes.put("publication", Publication.class);
        resultTypes.put("dataset", Dataset.class);
        resultTypes.put("software", Software.class);
@@ -69,7 +69,6 @@ public class ModelSupport {
    }

    public static final Map<String, Class> oafTypes = Maps.newHashMap();

    static {
@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import java.io.Serializable;

@@ -5,43 +6,40 @@ import java.util.List;
import java.util.stream.Collectors;

public class ASResultInfo implements Serializable {

    private String id;
    private String type; // result or relation
    private List<InferenceInfo> value; // the community or the project

    public String getId() {
        return id;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public List<InferenceInfo> getValue() {
        return value;
    }

    public void setValue(List<InferenceInfo> value) {
        this.value = value;
    }

    public void setId(String id) {
        this.id = id;
    }

    public static ASResultInfo copy(ASResultInfo as) {
        ASResultInfo ar = new ASResultInfo();
        ar.id = as.id;
        ar.type = as.type;
        ar.value = as.value.stream().map(InferenceInfo::copy).collect(Collectors.toList());
        return ar;
    }

}
@@ -1,42 +1,43 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import java.io.Serializable;

public class ActionSet implements Serializable {

    private String name;
    private String rawset;
    private String directory;

    public String getDirectory() {
        return directory;
    }

    public void setDirectory(String directory) {
        this.directory = directory;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getRawset() {
        return rawset;
    }

    public void setRawset(String rawset) {
        this.rawset = rawset;
    }

    public static ActionSet newInstance(String name, String rawset, String directory) {
        ActionSet as = new ActionSet();
        as.name = name;
        as.rawset = rawset;
        as.directory = directory;
        return as;
    }

}
@@ -1,43 +1,43 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import java.io.Serializable;

public class InferenceInfo implements Serializable {

    private String value;
    private String trust;
    private String inference_provenance;

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }

    public String getTrust() {
        return trust;
    }

    public void setTrust(String trust) {
        this.trust = trust;
    }

    public String getInference_provenance() {
        return inference_provenance;
    }

    public void setInference_provenance(String inference_provenance) {
        this.inference_provenance = inference_provenance;
    }

    public static InferenceInfo copy(InferenceInfo ii) {
        InferenceInfo iinfo = new InferenceInfo();
        iinfo.value = ii.value;
        iinfo.trust = ii.trust;
        iinfo.inference_provenance = ii.inference_provenance;
        return iinfo;
    }

}
@@ -1,10 +1,15 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
@@ -15,133 +20,139 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import com.google.gson.Gson;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;

public class PrepareInfo implements Serializable {

    private String outputPath;
    private Boolean isSparkSessionManaged;
    private String asInputPath;
    private String inputGraphPath;
    private List<String> actionSetActive;
    private List<ActionSet> actionSetList;

    public PrepareInfo(Boolean isSparkSessionManaged,
        String outputPath,
        String asInputPath,
        String inputGraphPath,
        List<String> actionSetActive,
        List<ActionSet> actionSetList) {
        this.isSparkSessionManaged = isSparkSessionManaged;
        this.outputPath = outputPath;
        this.inputGraphPath = inputGraphPath;
        this.asInputPath = asInputPath;
        this.actionSetActive = actionSetActive;
        this.actionSetList = actionSetList;
    }

    public void run() {

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                common.removeOutputDir(spark, outputPath);
                exec(
                    spark, asInputPath, inputGraphPath, outputPath,
                    actionSetList.stream().map(as -> {
                        if (actionSetActive.contains(as.getName())) {
                            return as;
                        }
                        return null;
                    })
                        .filter(Objects::nonNull)
                        .collect(Collectors.toList()));
            });
    }

    private static void exec(SparkSession spark, String asInputPath, String graphInputPath, String outputPath,
        List<ActionSet> actionSets) {

        actionSets
            .forEach(
                as -> resolveActionSet(
                    spark, asInputPath + "/" + as.getDirectory() + "/" + as.getRawset(),
                    outputPath));

        Dataset<Relation> relation = common.readPath(spark, graphInputPath + "/relation", Relation.class);
        relation.createOrReplaceTempView("relation");

        spark
            .sql(
                "SELECT source dedupId, collect_set(target) as merges " +
                    "FROM relation " +
                    "WHERE relclass = 'merges' " +
                    "GROUP BY source")
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath + "/relation/");
    }

    private static void resolveActionSet(SparkSession spark, String asInputPath, String outputPath) {
        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        spark
            .createDataset(
                sc
                    .sequenceFile(asInputPath, Text.class, Text.class)
                    .map(a -> common.OBJECT_MAPPER.readValue(a._2().toString(), AtomicAction.class))
                    .map(aa -> getAsResultInfo(aa))
                    .filter(Objects::nonNull)
                    .rdd(),
                Encoders.bean(ASResultInfo.class))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Append)
            .json(outputPath + "/actionset/");

    }

    private static ASResultInfo getAsResultInfo(AtomicAction aa) {
        ASResultInfo ri = new ASResultInfo();
        if (ModelSupport.resultRelationType.get(aa.getClazz()).equals("result")) {

            Result result = (Result) aa.getPayload();
            ri.setId(result.getId());
            ri.setType("result");
            List<InferenceInfo> inferenceInfoList = new ArrayList<>();

            result.getContext().forEach(c -> {
                String id = c.getId();
                c.getDataInfo().forEach(di -> {
                    InferenceInfo ii = new InferenceInfo();
                    ii.setValue(id);
                    ii.setTrust(di.getTrust());
                    ii.setInference_provenance(di.getInferenceprovenance());
                    inferenceInfoList.add(ii);
                });
            });

            ri.setValue(inferenceInfoList);

        } else {
            Relation rel = (Relation) aa.getPayload();
            if (rel.getSource().startsWith("50|")) {
                ri.setId(rel.getSource());
                ri.setType("relation");
                InferenceInfo ii = new InferenceInfo();
                ii.setInference_provenance(rel.getDataInfo().getInferenceprovenance());
                ii.setTrust(rel.getDataInfo().getTrust());
                ii.setValue(rel.getTarget());
                ri.setValue(Arrays.asList(ii));
            } else {
                return null;
            }
        }
        return ri;
    }

}
@@ -1,39 +1,42 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import java.io.Serializable;
import java.util.List;
import java.util.stream.Collectors;

import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class QueryInformationSystem implements Serializable {

    private static final String ACTION_MANAGER_PATH_QUERY = "for $x in " +
        "collection(' /db/DRIVER/ServiceResources/ActionManagerServiceResourceType') " +
        "return data($x//PROPERTY[./@key='basePath']/@value)";

    private static final String ACTION_SET_QUERY = "for $x in " +
        "collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " +
        "return concat (data($x//SET/@id),'@@',data($x//LATEST/@id),'@@',data($x//SET/@directory))";

    public static String getActionManagerPath(final String isLookupUrl)
        throws ISLookUpException {
        ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
        return isLookUp.quickSearchProfile(ACTION_MANAGER_PATH_QUERY).get(0);

    }

    public static List<ActionSet> getActionSet(final String isLookupUrl)
        throws ISLookUpException {
        ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
        return isLookUp
            .quickSearchProfile(ACTION_SET_QUERY)
            .stream()
            .map(res -> {
                String[] tmp = res.split("@@");
                return ActionSet.newInstance(tmp[0], tmp[1], tmp[2]);
            })
            .collect(Collectors.toList());
    }

}
@@ -1,25 +1,26 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import java.io.Serializable;
import java.util.List;

public class RelationMerges implements Serializable {

    private String dedupId;
    private List<String> merges;

    public String getDedupId() {
        return dedupId;
    }

    public void setDedupId(String dedupId) {
        this.dedupId = dedupId;
    }

    public List<String> getMerges() {
        return merges;
    }

    public void setMerges(List<String> merges) {
        this.merges = merges;
    }
}
@@ -1,24 +1,25 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import java.io.Serializable;

public class ResultPid implements Serializable {

    private String resultId;
    private String doi;

    public String getResultId() {
        return resultId;
    }

    public void setResultId(String resultId) {
        this.resultId = resultId;
    }

    public String getDoi() {
        return doi;
    }

    public void setDoi(String doi) {
        this.doi = doi;
    }
}
@@ -1,6 +1,11 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
@@ -11,80 +16,77 @@ import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import scala.Tuple2;

public class SparkExpandResultInfo implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(SparkExpandResultInfo.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkExpandResultInfo.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/actionmanager/remapping/input_expand_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

-       final String relationInputPath = parser.get("relationInputPath");
-       final String asInputPath = parser.get("asInputPath");
+       final String inputPath = parser.get("inputPath");
+       log.info("inputPath: {}", inputPath);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                common.removeOutputDir(spark, outputPath);
-               exec(spark, relationInputPath, asInputPath, outputPath);
+               exec(spark, inputPath, outputPath);
            });
    }

-   private static void exec(SparkSession spark, String relationInputPath, String asInputPath, String outputPath) {
-       Dataset<RelationMerges> rel = common.readPath(spark, relationInputPath, RelationMerges.class);
-       Dataset<ASResultInfo> asResultInfo = common.readPath(spark, asInputPath, ASResultInfo.class);
+   private static void exec(SparkSession spark, String inputPath, String outputPath) {
+       Dataset<RelationMerges> rel = common.readPath(spark, inputPath + "/relation", RelationMerges.class);
+       Dataset<ASResultInfo> asResultInfo = common.readPath(spark, inputPath + "/actionset", ASResultInfo.class);

        asResultInfo
            .joinWith(rel, asResultInfo.col("id").equalTo(rel.col("dedupId")), "left")
            .flatMap((FlatMapFunction<Tuple2<ASResultInfo, RelationMerges>, ASResultInfo>) value -> {
                ASResultInfo ri = value._1();
                if (Objects.isNull(value._2())) {
                    return Arrays.asList(ri).iterator();
                }

                return value
                    ._2()
                    .getMerges()
                    .stream()
                    .map(res -> {
                        ASResultInfo copy = ASResultInfo.copy(ri);
                        copy.setId(res);
                        return copy;
                    })
                    .collect(Collectors.toList())
                    .iterator();

            }, Encoders.bean(ASResultInfo.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);

    }

}
@@ -1,71 +1,66 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;

public class SparkPrepareInfo implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(SparkPrepareInfo.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkPrepareInfo.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/actionmanager/remapping/input_prepare_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String isLookUpUrl = parser.get("isLookUpUrl");
        log.info("isLookUpUrl: {}", isLookUpUrl);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String inputPath = parser.get("inputPath");

        final List<String> actionSetActive = new Gson().fromJson(parser.get("actionSets"), List.class);

        final String asInputPath = QueryInformationSystem.getActionManagerPath(isLookUpUrl);

        final List<ActionSet> actionSetList = QueryInformationSystem.getActionSet(isLookUpUrl);

        PrepareInfo prepareInfo = new PrepareInfo(isSparkSessionManaged, outputPath, asInputPath, inputPath,
            actionSetActive, actionSetList);

        prepareInfo.run();
    }

}
@@ -1,8 +1,12 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
@@ -15,116 +19,116 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.JsonProcessingException;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

public class SparkRedistributeIISRelations implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(SparkRedistributeIISRelations.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkRedistributeIISRelations.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/actionmanager/remapping/input_redistribute_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String inputPath = parser.get("inputPath");

        final String asInputPath = parser.get("asInputPath");

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                common.removeOutputDir(spark, outputPath);
                Dataset<ResultPid> resultPidDataset = common.readPath(spark, inputPath, ResultPid.class);
                Dataset<ASResultInfo> asResultInfoDataset = common.readPath(spark, asInputPath, ASResultInfo.class);
                execRelation(spark, asResultInfoDataset.filter("type = 'relation'"), resultPidDataset, outputPath);
            });
    }

    private static void execRelation(SparkSession spark, Dataset<ASResultInfo> asResultInfoDataset,
        Dataset<ResultPid> resultPidDataset, String outputPathRelation) {
        resultPidDataset
            .joinWith(
                asResultInfoDataset, resultPidDataset.col("resultId").equalTo(asResultInfoDataset.col("id")), "left")
            .flatMap((FlatMapFunction<Tuple2<ResultPid, ASResultInfo>, Relation>) value -> {
                List<Relation> relationList = new ArrayList<>();
                if (Objects.nonNull(value._2())) {
                    relationList.add(getRelation(value._2(), "result"));
                    relationList.add(getRelation(value._2(), "project"));
                }
                return relationList.iterator();
            }, Encoders.bean(Relation.class))
            .filter(Objects::nonNull)
            .toJavaRDD()
            .map(p -> new AtomicAction(Relation.class, p))
            .mapToPair(
                aa -> getTextTextTuple2(aa))
            .saveAsHadoopFile(outputPathRelation, Text.class, Text.class, SequenceFileOutputFormat.class);

    }

    private static Relation getRelation(ASResultInfo asResultInfo, String type) {
        Relation r = new Relation();
        if (type.equals("result")) {
            r.setSource(asResultInfo.getId());
            r.setRelClass("isProducedBy");
            r.setTarget(asResultInfo.getValue().get(0).getValue());
        } else {
            r.setRelClass("produces");
            r.setSource(asResultInfo.getValue().get(0).getValue());
            r.setTarget(asResultInfo.getId());
        }
        r.setRelType("resultProject");
        r.setSubRelType("outcome");

        r.setDataInfo(getDataInfo(asResultInfo));
        return r;
    }

    private static DataInfo getDataInfo(ASResultInfo asResultInfo) {
        DataInfo di = new DataInfo();
        di.setInvisible(false);
        di.setInvisible(true);
        di.setDeletedbyinference(false);
        di.setTrust(asResultInfo.getValue().get(0).getTrust());
        di.setInferenceprovenance(asResultInfo.getValue().get(0).getInference_provenance());
        Qualifier pAction = new Qualifier();
        pAction.setClassid("iis");
        pAction.setClassname("iss");
        pAction.setSchemename("dnet:provenanceActions");
        pAction.setSchemeid("dnet:provenanceActions");
        di.setProvenanceaction(pAction);
        return di;
    }

    private static Tuple2<Text, Text> getTextTextTuple2(AtomicAction aa) throws JsonProcessingException {
        String st = "";
        System.out.println(st);
        return new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
            new Text(common.OBJECT_MAPPER.writeValueAsString(aa)));
    }
}
@@ -1,12 +1,14 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
@@ -17,100 +19,107 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.JsonProcessingException;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;

public class SparkRedistributeIISResult implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(SparkRedistributeIISResult.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkRedistributeIISResult.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/actionmanager/remapping/input_redistribute_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String outputPathRelation = parser.get("outputPathRelation");
        log.info("outputPathRelation: {}", outputPathRelation);

        final String inputPath = parser.get("inputPath");

        final String asInputPath = parser.get("asInputPath");

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                common.removeOutputDir(spark, outputPath);
                Dataset<ResultPid> resultPidDataset = common.readPath(spark, inputPath, ResultPid.class);
                Dataset<ASResultInfo> asResultInfoDataset = common.readPath(spark, asInputPath, ASResultInfo.class);
                execResult(spark, asResultInfoDataset.filter("type = 'result'"), resultPidDataset, outputPath);
                // execRelation(spark, asResultInfoDataset.filter("type = 'relation'"), resultPidDataset,
                // outputPathRelation);
            });
    }

    private static void execResult(SparkSession spark, Dataset<ASResultInfo> info, Dataset<ResultPid> resultPidDataset,
        String outputPathResult) {
        info
            .joinWith(resultPidDataset, info.col("id").equalTo(resultPidDataset.col("resultId")), "left")
            .map((MapFunction<Tuple2<ASResultInfo, ResultPid>, Result>) value -> {
                Result ri = null;
                if (Objects.nonNull(value._2())) {
                    ri = new Result();
                    ASResultInfo asri = value._1();
                    ResultPid rp = value._2();
                    ri.setId(value._1().getId());
                    ri
                        .setContext(
                            asri
                                .getValue()
                                .stream()
                                .map(c -> {
                                    Context context = new Context();
                                    context.setId(c.getValue());
                                    DataInfo di = new DataInfo();
                                    di.setInferenceprovenance(c.getInference_provenance());
                                    di.setTrust(c.getTrust());
                                    di.setDeletedbyinference(false);
                                    Qualifier pa = new Qualifier();
                                    pa.setClassname("iis");
                                    pa.setClassid("iis");
                                    pa.setSchemeid("dnet:provenanceActions");
                                    pa.setSchemename("dnet:provenanceActions");
                                    di.setProvenanceaction(pa);
                                    context.setDataInfo(Arrays.asList(di));
                                    return context;
                                })
                                .collect(Collectors.toList()));
                }
                return ri;
            }, Encoders.bean(Result.class))
            .filter(Objects::nonNull)
            .toJavaRDD()
            .mapToPair(r -> getTextTextTuple2(r))
            .saveAsHadoopFile(outputPathResult, Text.class, Text.class, SequenceFileOutputFormat.class);
    }

    private static Tuple2<Text, Text> getTextTextTuple2(Result r) throws JsonProcessingException {
        AtomicAction aa = new AtomicAction(Result.class, r);
        return new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
            new Text(common.OBJECT_MAPPER.writeValueAsString(aa)));
    }
}
@@ -1,8 +1,11 @@
package eu.dnetlib.dhp.actionmanager.remapping;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
@@ -11,73 +14,75 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Result;

public class SparkSelectResults implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(SparkSelectResults.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkSelectResults.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/actionmanager/remapping/input_select_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String inputPath = parser.get("inputPath");

        // final String resultClassName = parser.get("resultTableName");
        // log.info("resultTableName: {}", resultClassName);
        //
        // Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                common.removeOutputDir(spark, outputPath);
                run(spark, inputPath, outputPath);
            });
    }

    private static void run(SparkSession spark, String inputPath, String outputPath) {
        ModelSupport.resultTypes
            .keySet()
            .forEach(key -> exec(spark, inputPath + "/" + key, outputPath, ModelSupport.resultTypes.get(key)));
    }

    private static <R extends Result> void exec(SparkSession spark, String inputPath, String outputPath,
        Class<R> resultClazz) {
        Dataset<R> result = common.readPath(spark, inputPath, resultClazz);

        result.createOrReplaceTempView("result");

        spark
            .sql(
                "SELECT id resultId, persId.value doi " +
                    "from result " +
                    "lateral view explode(pid) p as persId " +
                    "lateral view explode(collectedfrom) c as cf " +
                    "where persId.qualifier.classid = 'doi' " +
                    "and (cf.key = '10|openaire____::9e3be59865b2c1c335d32dae2fe7b254' or " +
                    "cf.key = '10|openaire____::081b82f96300b6a6e3d282bad31cb6e2') " +
                    "and result.id not like '50|dedup%' ")
            .write()
            .mode(SaveMode.Append)
            .option("compression", "gzip")
            .json(outputPath);
    }
}
@ -1,27 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.remapping;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
|
||||
public class common implements Serializable {
|
||||
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
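	// Read a newline-delimited JSON text file and map each line to an instance of clazz.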
public static <R> Dataset<R> readPath(
|
||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
public static <R> Dataset<R> readPath(
|
||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
public static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
public static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
}
@@ -7,15 +7,9 @@
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "asp",
|
||||
"paramLongName": "asInputPath",
|
||||
"paramDescription": "the allowed list of action sets",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "rip",
|
||||
"paramLongName": "relationInputPath",
|
||||
"paramDescription": "the input path",
|
||||
"paramName": "ip",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the input path of the prepared info",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
@@ -0,0 +1,54 @@
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorNumber</name>
|
||||
<value>4</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<value>15G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<value>6G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>
@@ -0,0 +1,182 @@
<workflow-app name="RemapIIS" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>isLookUpUrl</name>
|
||||
<description>the IsLookUpUrl</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>inputPath</name>
|
||||
<description>the graph input path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>relationOutputPath</name>
|
||||
<description>path where to store the stable action set for relations</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>resultOutputPath</name>
|
||||
<description>path where to store the stable action set for results</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>actionSets</name>
|
||||
<description>the allowed list of action sets</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="deleteoutputpath"/>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<action name="deleteoutputpath">
|
||||
<fs>
|
||||
<delete path='${outputPath}'/>
|
||||
<mkdir path='${outputPath}'/>
|
||||
<delete path='${workingDir}'/>
|
||||
<mkdir path='${workingDir}'/>
|
||||
</fs>
|
||||
<ok to="prepare_info"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="prepare_info">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareInfo</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.remapping.SparkPrepareInfo</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
<arg>--actionSets</arg><arg>${actionSets}</arg>
|
||||
<arg>--inputPath</arg><arg>${inputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||
</spark>
|
||||
<ok to="prepare_next"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="prepare_next">
|
||||
<path start="expand_relation"/>
|
||||
<path start="select_results"/>
|
||||
</fork>
|
||||
|
||||
<action name="expand_relation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExpandRelation</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.remapping.SparkExpandResultInfo</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/expandedActionSet</arg>
|
||||
</spark>
|
||||
<ok to="wait1"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="select_results">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>SelectResults</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.remapping.SparkSelectResults</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/selectedResults</arg>
|
||||
</spark>
|
||||
<ok to="wait1"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait1" to="redistributeAA"/>
|
||||
|
||||
<fork name="redistributeAA">
|
||||
<path start="redistribute_relation"/>
|
||||
<path start="redistribute_result"/>
|
||||
</fork>
|
||||
|
||||
<action name="redistribute_relation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>RedistributeRelation</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.remapping.SparkRedistributeIISRelations</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/selectedResults</arg>
|
||||
<arg>--asInputPath</arg><arg>${workingDir}/expandedActionSet</arg>
|
||||
<arg>--outputPath</arg><arg>${relationOutputPath}</arg>
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="redistribute_result">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>RedistributeResult</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.remapping.SparkRedistributeIISResult</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/selectedResults</arg>
|
||||
<arg>--asInputPath</arg><arg>${workingDir}/expandedActionSet</arg>
|
||||
<arg>--outputPath</arg><arg>${resultOutputPath}</arg>
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait2" to="End"/>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
@@ -1,7 +1,10 @@
package eu.dnetlib.dhp.actionmanager.remapping;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.neethi.Assertion;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@@ -18,123 +21,156 @@ import org.junit.jupiter.api.Test;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
|
||||
|
||||
public class ExpandResultInfoTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final ClassLoader cl = PrepareInfoTest.class
|
||||
.getClassLoader();
|
||||
private static final ClassLoader cl = PrepareInfoTest.class
|
||||
.getClassLoader();
|
||||
|
||||
private static SparkSession spark;
|
||||
private static final String FAKE_ISLOOKUP = "http://beta.services.openaire.eu/";
|
||||
private static SparkSession spark;
|
||||
private static final String FAKE_ISLOOKUP = "http://beta.services.openaire.eu/";
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(ExpandResultInfoTest.class);
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(ExpandResultInfoTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(ExpandResultInfoTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(ExpandResultInfoTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(ExpandResultInfoTest.class.getSimpleName());
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(ExpandResultInfoTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(ExpandResultInfoTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(ExpandResultInfoTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void expandAS() throws Exception {
|
||||
SparkExpandResultInfo
|
||||
.main(
|
||||
new String[]{
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-asInputPath",
|
||||
getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/remapping/step2/preparedInfo/as/as")
|
||||
.getPath(),
|
||||
"-relationInputPath",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step2/preparedInfo/relations/relations").getPath(),
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/expandedActionSet"
|
||||
});
|
||||
@Test
|
||||
public void expandAS() throws Exception {
|
||||
SparkExpandResultInfo
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step2/preparedInfo").getPath(),
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/expandedActionSet"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/expandedActionSet")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/expandedActionSet")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
Assertions.assertEquals(25, verificationDataset.count());
|
||||
Assertions.assertEquals(25, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(0, verificationDataset.filter("substr(id,1,8) = '50|dedup'").count());
|
||||
Assertions.assertEquals(0, verificationDataset.filter("substr(id,1,8) = '50|dedup'").count());
|
||||
|
||||
Assertions.assertEquals(3, verificationDataset.filter("id = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a'").count());
|
||||
Assertions.assertEquals(3, verificationDataset.filter("id = '50|od_______166::779de9b3a2d224779be52fae43b5fc80'").count());
|
||||
Assertions.assertEquals(3, verificationDataset.filter("id = '50|od_______165::779de9b3a2d224779be52fae43b5fc80'").count());
|
||||
Assertions.assertEquals(3, verificationDataset.filter("id = '50|od______3515::779de9b3a2d224779be52fae43b5fc80'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, verificationDataset.filter("id = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, verificationDataset.filter("id = '50|od_______166::779de9b3a2d224779be52fae43b5fc80'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, verificationDataset.filter("id = '50|od_______165::779de9b3a2d224779be52fae43b5fc80'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, verificationDataset.filter("id = '50|od______3515::779de9b3a2d224779be52fae43b5fc80'").count());
|
||||
|
||||
Assertions.assertEquals(2, verificationDataset.filter("id = '50|doiboost____::78329557c23bee513963ebf295d1434d'").count());
|
||||
Assertions.assertEquals(2, verificationDataset.filter("id = '50|doiboost____::8978b9b797294da5306950a94a58d98c'").count());
|
||||
Assertions.assertEquals(2, verificationDataset.filter("id = '50|doiboost____::fb2c70723d74f45329640255a959333d'").count());
|
||||
Assertions.assertEquals(2, verificationDataset.filter("id = '50|base_oa_____::fb2c70723d74f45329640255a959333d'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, verificationDataset.filter("id = '50|doiboost____::78329557c23bee513963ebf295d1434d'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, verificationDataset.filter("id = '50|doiboost____::8978b9b797294da5306950a94a58d98c'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, verificationDataset.filter("id = '50|doiboost____::fb2c70723d74f45329640255a959333d'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, verificationDataset.filter("id = '50|base_oa_____::fb2c70723d74f45329640255a959333d'").count());
|
||||
|
||||
Assertions.assertEquals(5, verificationDataset.filter("id = '50|_____OmicsDI::039dbb63f11b19dc15113b34ebceb0d2' " +
|
||||
"or id = '50|_____OmicsDI::05f133acca27d72866c6720a95515f57' or " +
|
||||
"id = '50|_____OmicsDI::2d508eba981699a30e969d1ab5a068b8' or " +
|
||||
"id = '50|datacite____::00bddedc38dc045780dc84c27bc8fecd' or " +
|
||||
"id = '50|datacite____::00f7f89392fa75e944dc8d329e9e8024'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
5, verificationDataset
|
||||
.filter(
|
||||
"id = '50|_____OmicsDI::039dbb63f11b19dc15113b34ebceb0d2' " +
|
||||
"or id = '50|_____OmicsDI::05f133acca27d72866c6720a95515f57' or " +
|
||||
"id = '50|_____OmicsDI::2d508eba981699a30e969d1ab5a068b8' or " +
|
||||
"id = '50|datacite____::00bddedc38dc045780dc84c27bc8fecd' or " +
|
||||
"id = '50|datacite____::00f7f89392fa75e944dc8d329e9e8024'")
|
||||
.count());
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
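		// Explode the nested value list so each inferred (value, trust, provenance) entry can be asserted on individually.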
Dataset<Row> verify = spark
|
||||
.sql(
|
||||
("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Dataset<Row> verify = spark.sql(("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
Assertions.assertEquals(25, verify.count());
|
||||
|
||||
Assertions.assertEquals(25, verify.count());
|
||||
Assertions.assertEquals(20, verify.filter("type = 'relation'").count());
|
||||
Assertions.assertEquals(5, verify.filter("type = 'result'").count());
|
||||
|
||||
Assertions.assertEquals(20, verify.filter("type = 'relation'").count());
|
||||
Assertions.assertEquals(5, verify.filter("type = 'result'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, verify
|
||||
.filter(
|
||||
"id = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a' " +
|
||||
"and value = '40|rcuk________::8dec51859e6b66cd040670b432b9e59c' and " +
|
||||
"prov = 'iis::document_referencedProjects' and " +
|
||||
"trust = '0.897'")
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a' " +
|
||||
"and value = '40|rcuk________::8dec51859e6b66cd040670b432b9e59c' and " +
|
||||
"prov = 'iis::document_referencedProjects' and " +
|
||||
"trust = '0.897'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, verify
|
||||
.filter(
|
||||
"id = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a' " +
|
||||
"and value = '40|rcuk________::5e312e08bd65f126d7d79b3d1d677eb3' and " +
|
||||
"prov = 'iis::document_referencedProjects' and " +
|
||||
"trust = '0.897'")
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a' " +
|
||||
"and value = '40|rcuk________::5e312e08bd65f126d7d79b3d1d677eb3' and " +
|
||||
"prov = 'iis::document_referencedProjects' and " +
|
||||
"trust = '0.897'").count());
|
||||
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a' " +
|
||||
"and value = '40|corda_______::6d500f8fceb2bb81b0750820469e1cd8' and " +
|
||||
"prov = 'iis::document_referencedProjects' and " +
|
||||
"trust = '0.7085'").count());
|
||||
}
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, verify
|
||||
.filter(
|
||||
"id = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a' " +
|
||||
"and value = '40|corda_______::6d500f8fceb2bb81b0750820469e1cd8' and " +
|
||||
"prov = 'iis::document_referencedProjects' and " +
|
||||
"trust = '0.7085'")
|
||||
.count());
|
||||
}
|
||||
}
@@ -1,7 +1,12 @@
package eu.dnetlib.dhp.actionmanager.remapping;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.Gson;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
@@ -14,343 +19,544 @@ import org.junit.jupiter.api.*;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.Gson;
|
||||
|
||||
@Disabled
|
||||
public class PrepareInfoTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final ClassLoader cl = PrepareInfoTest.class
|
||||
.getClassLoader();
|
||||
|
||||
private static SparkSession spark;
|
||||
private static final String FAKE_ISLOOKUP = "http://beta.services.openaire.eu/";
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(PrepareInfoTest.class);
|
||||
|
||||
private static final List<ActionSet> actionSetList =
|
||||
Arrays.asList(ActionSet.newInstance("iis-dataset-entities-preprocessing", "rawset_74155091-ce3b-4951-849c-f41dd4186699_1555613756167","entities_dataset"),
|
||||
ActionSet.newInstance("iis-document-statistics","rawset_d0a24381-1241-4d83-9669-22110ab72f63_1415965369474","document_statistics"),
|
||||
ActionSet.newInstance("iis-researchinitiative", "rawset_718b528a-0a10-4303-9290-4f61c04b7ace_1594030710764","document_research_initiative"),
|
||||
ActionSet.newInstance("iis-document-citations","rawset_b22d36c2-36e5-4ff3-97ef-946fa84a57dd_1594030710774","document_referencedDocuments"),
|
||||
ActionSet.newInstance("iis-dataset-entities-main","rawset_fffa6131-3e7d-4c2c-82d8-844517e721c0_1594030710760","entities_dataset"),
|
||||
ActionSet.newInstance("iis-document-affiliation","rawset_d62066d3-b6d9-424a-bea0-bab884a55292_1594030710732","matched_doc_organizations"),
|
||||
ActionSet.newInstance("iis-document-classes","rawset_35b252fb-2180-4115-818d-a8110616b892_1594030710771","document_classes"),
|
||||
ActionSet.newInstance("iis-document-similarities","rawset_cc4706b4-ed1d-4862-a13b-b0afdd7016a3_1594030710768","document_similarities_standard"),
|
||||
ActionSet.newInstance("iis-referenced-datasets-main","rawset_c2ea95d3-a2c0-48f4-9184-5f3478399cc6_1594030710757","document_referencedDatasets"),
|
||||
ActionSet.newInstance("iis-referenced-datasets-preprocessing","rawset_91543cfa-b543-46c1-a87f-b1a550bc6937_1555613756158","document_referencedDatasets"),
|
||||
ActionSet.newInstance("iis-referenced-projects-main","rawset_ccf0d39d-0077-4e61-af6c-dc191e2fca68_1594030710754","document_referencedProjects"),
|
||||
ActionSet.newInstance("iis-referenced-projects-preprocessing","rawset_564ae405-a221-472f-8bd0-ee8bfbbd9164_1555613756135","document_referencedProjects"),
|
||||
ActionSet.newInstance("iis-referenceextraction-pdb","rawset_07dc3b63-e5a4-4a54-90ba-5226fd55f1c9_1594030710776","document_pdb"),
|
||||
ActionSet.newInstance("document_software_url","rawset_75e2b097-2a10-41a7-97eb-70737b678793_1594030710779","document_software_url"),
|
||||
ActionSet.newInstance("iis-wos-entities","rawset_handled6-0ab3-4fd0-a33b-refereed2fc0_1555613756163","entities_document"),
|
||||
ActionSet.newInstance("iis-extracted-metadata","rawset_f859722c-bfec-4711-9132-8b24766c208d_1415965369473","document_metadata"),
|
||||
ActionSet.newInstance("dedup-similarity-organization-simple","rawset_a62362a8-9800-4ba3-a060-1fbc0e3ea1a5_1587998691703","dedup-similarity-organization-simple"),
|
||||
ActionSet.newInstance("dedup-similarity-organization","rawset_9da5e0f1-a49a-40fc-aaac-17de80fa5ceb_1436537292583","dedup-similarity-organization"),
|
||||
ActionSet.newInstance("dedup-similarity-result-levenstein","rawset_4921d674-aea3-4115-ad33-fe6833569176_1587984647217","dedup-similarity-result-levenstein"),
|
||||
ActionSet.newInstance("dedup-similarity-person","","dedup-similarity-person"),
|
||||
ActionSet.newInstance("iis-entities-software","rawset_c4b060b5-d620-45dd-9a0a-25befb23ef7c_1594030710782","entities_software"),
|
||||
ActionSet.newInstance("iis-communities","rawset_4c632429-6f12-4f18-b54f-e60b346859d7_1594030710791","document_community"),
|
||||
ActionSet.newInstance("scholexplorer-dump","rawset_d349ffdd-384a-47f6-986f-6c04edee3294_1592572750","scholexplorer-dump"),
|
||||
ActionSet.newInstance("gridac-dump","rawset_a2854367-3586-4945-a124-1328e91568bd_1571646606840","gridac-dump"),
|
||||
ActionSet.newInstance("doiboost-organizations","rawset_7626c52f-7f17-47f0-9094-da2c6b883d41_1574951682027","doiboost-organizations"),
|
||||
ActionSet.newInstance("doiboost","rawset_handledb-a3ae-4d6e-8187-refereed6e18_15912730340000","doiboost"),
|
||||
ActionSet.newInstance("orcidworks-no-doi","rawset_handledf-ef8d-4e1d-89b6-refereed6ce6_1574862348031","orcidworks-no-doi"),
|
||||
ActionSet.newInstance("iis-entities-patent","rawset_4dfd999e-7e3c-48eb-be92-92393da81e19_1594030710806","entities_patent"),
|
||||
ActionSet.newInstance("iis-referenced-patents","rawset_b4387e83-ed2c-4c4b-9d1d-4da35a6ad752_1594030710803","document_patent"),
|
||||
ActionSet.newInstance("iis-covid-19","rawset_437833b5-b2ef-4e3f-8642-e845086ccc2c_1594030710809","document_covid19"),
|
||||
ActionSet.newInstance("h2020programme","rawset_bcca8d44-6139-4aec-b579-761552440162_1590697435148","h2020programme"));
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(PrepareInfoTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(PrepareInfoTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(PrepareInfoTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRelationsAS() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[\"iis-referenced-projects-main\"]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.show(false);
|
||||
System.out.println(verificationDataset.count()); //1813019 unidirectional relations
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark.sql(("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(1813019, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(44804, verificationDataset.filter("type = 'result'").count());
|
||||
Assertions.assertEquals(1768215, verificationDataset.filter("type = 'relation'").count());
|
||||
|
||||
Assertions.assertEquals(0, verify.filter("substr(id,1,3) = '40|'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|od________18::c8e57f11074407d59f7114f047afd54e'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|od________18::c8e57f11074407d59f7114f047afd54e' and type = 'relation'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|od________18::c8e57f11074407d59f7114f047afd54e' and value = '40|nsf_________::2bedb915e92b7dd25b082c6c2f241085'").count());
|
||||
|
||||
Assertions.assertEquals(2, verificationDataset.filter("id = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f'").count());
|
||||
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f' " +
|
||||
"and type = 'relation' and value = '40|anr_________::5437f242b18aa615acf57dced27975c6'").count());
|
||||
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f' " +
|
||||
"and type = 'relation' and value = '40|dfgf________::7aa0b0185d7db055823cd0734ddd6521'").count());
|
||||
|
||||
Assertions.assertEquals(1828481, verify.filter("prov = 'iis::document_referencedProjects'").count());
|
||||
|
||||
Assertions.assertEquals(1, verificationDataset.filter("type = 'result' and id = '50|coactionpubl::127fa9acff1ee8e86da354d1da2378da'").count());
|
||||
Assertions.assertEquals(2, verify.filter("type = 'result' and id = '50|coactionpubl::127fa9acff1ee8e86da354d1da2378da'").count());
|
||||
Assertions.assertEquals(1, verify.filter("type = 'result' and id = '50|coactionpubl::127fa9acff1ee8e86da354d1da2378da' and value = 'mes::projects::307'").count());
|
||||
Assertions.assertEquals(1, verify.filter("type = 'result' and id = '50|coactionpubl::127fa9acff1ee8e86da354d1da2378da' and value = 'mes::projects::421'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInitiative() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[\"iis-researchinitiative\"]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark.sql(("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(14505, verificationDataset.filter("type = 'result'").count());
|
||||
Assertions.assertEquals(0, verificationDataset.filter("type = 'relation'").count());
|
||||
|
||||
Assertions.assertEquals(0, verify.filter("prov != 'iis::document_research_initiative'").count());
|
||||
|
||||
Assertions.assertEquals(14639, verify.count());
|
||||
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|_____OmicsDI::278d318ee9f051971236234b181d79ce'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|_____OmicsDI::278d318ee9f051971236234b181d79ce' and value = 'egi::virtual::10256'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|_____OmicsDI::278d318ee9f051971236234b181d79ce' and value = 'egi::virtual::10256' " +
|
||||
"and trust = '0.9'").count());
|
||||
|
||||
Assertions.assertEquals(2, verify.filter("id = '50|dedup_wf_001::be0f7ddf838f07be2ab62e343244a255'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|dedup_wf_001::be0f7ddf838f07be2ab62e343244a255' and value = 'egi::virtual::150'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|dedup_wf_001::be0f7ddf838f07be2ab62e343244a255' and value = 'egi::virtual::71' " +
|
||||
"and trust = '0.9'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCommunities() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[\"iis-communities\"]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark.sql(("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(0, verificationDataset.filter("type = 'relation'").count());
|
||||
|
||||
Assertions.assertEquals(0, verify.filter("prov != 'iis::document_community'").count());
|
||||
|
||||
Assertions.assertEquals(1129, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(1395, verify.count());
|
||||
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|core_ac_uk__::2aab0d504dae88edc6d7214f4ab62e4f'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|core_ac_uk__::2aab0d504dae88edc6d7214f4ab62e4f' and value = 'dariah' " +
|
||||
"and trust = '0.9'").count());
|
||||
|
||||
Assertions.assertEquals(2, verify.filter("id = '50|core_ac_uk__::2e09ba90d7f712f24a4f48b39571f15f'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|core_ac_uk__::2e09ba90d7f712f24a4f48b39571f15f' and value = 'clarin'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|core_ac_uk__::2e09ba90d7f712f24a4f48b39571f15f' and value = 'dh-ch::subcommunity::2' " +
|
||||
"and trust = '0.9'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCovid19() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[\"iis-covid-19\"]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark.sql(("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(0, verificationDataset.filter("type = 'relation'").count());
|
||||
|
||||
Assertions.assertEquals(0, verify.filter("prov != 'iis::document_covid19'").count());
|
||||
|
||||
Assertions.assertEquals(45093, verify.filter("value = 'covid-19'").count());
|
||||
|
||||
Assertions.assertEquals(45093, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(3, verify.filter("id = '50|_____OmicsDI::039dbb63f11b19dc15113b34ebceb0d2' or " +
|
||||
"id = '50|_____OmicsDI::05f133acca27d72866c6720a95515f57' or " +
|
||||
"id = '50|_____OmicsDI::19c2cff8e86d7ae39f7e34f43ee06735'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAll() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[\"iis-researchinitiative\",\"iis-referenced-projects-main\",\"iis-communities\",\"iis-covid-19\"]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark.sql(("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(1768215, verificationDataset.filter("type = 'relation'").count());
|
||||
Assertions.assertEquals((45093 + 1129 + 14505 + 44804), verificationDataset.filter("type = 'result'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRelationMerged(){
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<RelationMerges> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/relation")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, RelationMerges.class));
|
||||
|
||||
Dataset<RelationMerges> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(RelationMerges.class));
|
||||
|
||||
verificationDataset.show(false);
|
||||
Assertions.assertEquals(3, verificationDataset.count());
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark.sql("Select dedupId, me merged " +
|
||||
"from verificationDataset " +
|
||||
"lateral view explode(merges) m as me");
|
||||
|
||||
Assertions.assertEquals(8, verify.count());
|
||||
Assertions.assertEquals(2 , verify.filter("dedupId = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f'").count());
|
||||
Assertions.assertEquals(2 , verify.filter("dedupId = '50|dedup_wf_001::7df4b3b26df271628a837c209516902a'").count());
|
||||
Assertions.assertEquals(4 , verify.filter("dedupId = '50|dedup_wf_001::b04d742132c133177e996add1325ec04'").count());
|
||||
|
||||
Assertions.assertEquals(2, verify.filter("dedupId = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f' " +
|
||||
"and (me ='50|base_oa_____::fb2c70723d74f45329640255a959333d' or me = '50|doiboost____::fb2c70723d74f45329640255a959333d')").count());
|
||||
|
||||
Assertions.assertEquals(2 , verify.filter("dedupId = '50|dedup_wf_001::7df4b3b26df271628a837c209516902a' " +
|
||||
"and (me ='50|doiboost____::78329557c23bee513963ebf295d1434d' or me = '50|doiboost____::8978b9b797294da5306950a94a58d98c')").count());
|
||||
|
||||
Assertions.assertEquals(4 , verify.filter("dedupId = '50|dedup_wf_001::b04d742132c133177e996add1325ec04' " +
|
||||
"and (me = '50|od______3515::779de9b3a2d224779be52fae43b5fc80' or me = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a' " +
|
||||
"or me = '50|od_______166::779de9b3a2d224779be52fae43b5fc80' or me = '50|od_______165::779de9b3a2d224779be52fae43b5fc80')").count());
|
||||
|
||||
}
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final ClassLoader cl = PrepareInfoTest.class
|
||||
.getClassLoader();
|
||||
|
||||
private static SparkSession spark;
|
||||
private static final String FAKE_ISLOOKUP = "http://beta.services.openaire.eu/";
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(PrepareInfoTest.class);
|
||||
|
||||
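	// Catalogue of action sets (name, raw set identifier, directory) made available to PrepareInfo in these tests.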
private static final List<ActionSet> actionSetList = Arrays
|
||||
.asList(
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-dataset-entities-preprocessing", "rawset_74155091-ce3b-4951-849c-f41dd4186699_1555613756167",
|
||||
"entities_dataset"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-document-statistics", "rawset_d0a24381-1241-4d83-9669-22110ab72f63_1415965369474",
|
||||
"document_statistics"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-researchinitiative", "rawset_718b528a-0a10-4303-9290-4f61c04b7ace_1594030710764",
|
||||
"document_research_initiative"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-document-citations", "rawset_b22d36c2-36e5-4ff3-97ef-946fa84a57dd_1594030710774",
|
||||
"document_referencedDocuments"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-dataset-entities-main", "rawset_fffa6131-3e7d-4c2c-82d8-844517e721c0_1594030710760",
|
||||
"entities_dataset"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-document-affiliation", "rawset_d62066d3-b6d9-424a-bea0-bab884a55292_1594030710732",
|
||||
"matched_doc_organizations"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-document-classes", "rawset_35b252fb-2180-4115-818d-a8110616b892_1594030710771",
|
||||
"document_classes"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-document-similarities", "rawset_cc4706b4-ed1d-4862-a13b-b0afdd7016a3_1594030710768",
|
||||
"document_similarities_standard"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-referenced-datasets-main", "rawset_c2ea95d3-a2c0-48f4-9184-5f3478399cc6_1594030710757",
|
||||
"document_referencedDatasets"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-referenced-datasets-preprocessing",
|
||||
"rawset_91543cfa-b543-46c1-a87f-b1a550bc6937_1555613756158", "document_referencedDatasets"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-referenced-projects-main", "rawset_ccf0d39d-0077-4e61-af6c-dc191e2fca68_1594030710754",
|
||||
"document_referencedProjects"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-referenced-projects-preprocessing",
|
||||
"rawset_564ae405-a221-472f-8bd0-ee8bfbbd9164_1555613756135", "document_referencedProjects"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-referenceextraction-pdb", "rawset_07dc3b63-e5a4-4a54-90ba-5226fd55f1c9_1594030710776",
|
||||
"document_pdb"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"document_software_url", "rawset_75e2b097-2a10-41a7-97eb-70737b678793_1594030710779",
|
||||
"document_software_url"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-wos-entities", "rawset_handled6-0ab3-4fd0-a33b-refereed2fc0_1555613756163",
|
||||
"entities_document"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-extracted-metadata", "rawset_f859722c-bfec-4711-9132-8b24766c208d_1415965369473",
|
||||
"document_metadata"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"dedup-similarity-organization-simple", "rawset_a62362a8-9800-4ba3-a060-1fbc0e3ea1a5_1587998691703",
|
||||
"dedup-similarity-organization-simple"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"dedup-similarity-organization", "rawset_9da5e0f1-a49a-40fc-aaac-17de80fa5ceb_1436537292583",
|
||||
"dedup-similarity-organization"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"dedup-similarity-result-levenstein", "rawset_4921d674-aea3-4115-ad33-fe6833569176_1587984647217",
|
||||
"dedup-similarity-result-levenstein"),
|
||||
ActionSet.newInstance("dedup-similarity-person", "", "dedup-similarity-person"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-entities-software", "rawset_c4b060b5-d620-45dd-9a0a-25befb23ef7c_1594030710782",
|
||||
"entities_software"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-communities", "rawset_4c632429-6f12-4f18-b54f-e60b346859d7_1594030710791",
|
||||
"document_community"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"scholexplorer-dump", "rawset_d349ffdd-384a-47f6-986f-6c04edee3294_1592572750",
|
||||
"scholexplorer-dump"),
|
||||
ActionSet
|
||||
.newInstance("gridac-dump", "rawset_a2854367-3586-4945-a124-1328e91568bd_1571646606840", "gridac-dump"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"doiboost-organizations", "rawset_7626c52f-7f17-47f0-9094-da2c6b883d41_1574951682027",
|
||||
"doiboost-organizations"),
|
||||
ActionSet.newInstance("doiboost", "rawset_handledb-a3ae-4d6e-8187-refereed6e18_15912730340000", "doiboost"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"orcidworks-no-doi", "rawset_handledf-ef8d-4e1d-89b6-refereed6ce6_1574862348031",
|
||||
"orcidworks-no-doi"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-entities-patent", "rawset_4dfd999e-7e3c-48eb-be92-92393da81e19_1594030710806",
|
||||
"entities_patent"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-referenced-patents", "rawset_b4387e83-ed2c-4c4b-9d1d-4da35a6ad752_1594030710803",
|
||||
"document_patent"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"iis-covid-19", "rawset_437833b5-b2ef-4e3f-8642-e845086ccc2c_1594030710809", "document_covid19"),
|
||||
ActionSet
|
||||
.newInstance(
|
||||
"h2020programme", "rawset_bcca8d44-6139-4aec-b579-761552440162_1590697435148", "h2020programme"));
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(PrepareInfoTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(PrepareInfoTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(PrepareInfoTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRelationsAS() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[\"iis-referenced-projects-main\"]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.show(false);
|
||||
System.out.println(verificationDataset.count()); // 1813019 unidirectional relations
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark
|
||||
.sql(
|
||||
("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(1813019, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(44804, verificationDataset.filter("type = 'result'").count());
|
||||
Assertions.assertEquals(1768215, verificationDataset.filter("type = 'relation'").count());
|
||||
|
||||
Assertions.assertEquals(0, verify.filter("substr(id,1,3) = '40|'").count());
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|od________18::c8e57f11074407d59f7114f047afd54e'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify
|
||||
.filter("id = '50|od________18::c8e57f11074407d59f7114f047afd54e' and type = 'relation'")
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify
|
||||
.filter(
|
||||
"id = '50|od________18::c8e57f11074407d59f7114f047afd54e' and value = '40|nsf_________::2bedb915e92b7dd25b082c6c2f241085'")
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, verificationDataset.filter("id = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, verify
|
||||
.filter(
|
||||
"id = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f' " +
|
||||
"and type = 'relation' and value = '40|anr_________::5437f242b18aa615acf57dced27975c6'")
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, verify
|
||||
.filter(
|
||||
"id = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f' " +
|
||||
"and type = 'relation' and value = '40|dfgf________::7aa0b0185d7db055823cd0734ddd6521'")
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(1828481, verify.filter("prov = 'iis::document_referencedProjects'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verificationDataset
|
||||
.filter("type = 'result' and id = '50|coactionpubl::127fa9acff1ee8e86da354d1da2378da'")
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
verify.filter("type = 'result' and id = '50|coactionpubl::127fa9acff1ee8e86da354d1da2378da'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify
|
||||
.filter(
|
||||
"type = 'result' and id = '50|coactionpubl::127fa9acff1ee8e86da354d1da2378da' and value = 'mes::projects::307'")
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify
|
||||
.filter(
|
||||
"type = 'result' and id = '50|coactionpubl::127fa9acff1ee8e86da354d1da2378da' and value = 'mes::projects::421'")
|
||||
.count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInitiative() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[\"iis-researchinitiative\"]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark
|
||||
.sql(
|
||||
("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(14505, verificationDataset.filter("type = 'result'").count());
|
||||
Assertions.assertEquals(0, verificationDataset.filter("type = 'relation'").count());
|
||||
|
||||
Assertions.assertEquals(0, verify.filter("prov != 'iis::document_research_initiative'").count());
|
||||
|
||||
Assertions.assertEquals(14639, verify.count());
|
||||
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|_____OmicsDI::278d318ee9f051971236234b181d79ce'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify
|
||||
.filter(
|
||||
"id = '50|_____OmicsDI::278d318ee9f051971236234b181d79ce' and value = 'egi::virtual::10256'")
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify
|
||||
.filter(
|
||||
"id = '50|_____OmicsDI::278d318ee9f051971236234b181d79ce' and value = 'egi::virtual::10256' " +
|
||||
"and trust = '0.9'")
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(2, verify.filter("id = '50|dedup_wf_001::be0f7ddf838f07be2ab62e343244a255'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify
|
||||
.filter("id = '50|dedup_wf_001::be0f7ddf838f07be2ab62e343244a255' and value = 'egi::virtual::150'")
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify
|
||||
.filter(
|
||||
"id = '50|dedup_wf_001::be0f7ddf838f07be2ab62e343244a255' and value = 'egi::virtual::71' " +
|
||||
"and trust = '0.9'")
|
||||
.count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCommunities() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[\"iis-communities\"]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark
|
||||
.sql(
|
||||
("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(0, verificationDataset.filter("type = 'relation'").count());
|
||||
|
||||
Assertions.assertEquals(0, verify.filter("prov != 'iis::document_community'").count());
|
||||
|
||||
Assertions.assertEquals(1129, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(1395, verify.count());
|
||||
|
||||
Assertions.assertEquals(1, verify.filter("id = '50|core_ac_uk__::2aab0d504dae88edc6d7214f4ab62e4f'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, verify
|
||||
.filter(
|
||||
"id = '50|core_ac_uk__::2aab0d504dae88edc6d7214f4ab62e4f' and value = 'dariah' " +
|
||||
"and trust = '0.9'")
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(2, verify.filter("id = '50|core_ac_uk__::2e09ba90d7f712f24a4f48b39571f15f'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify.filter("id = '50|core_ac_uk__::2e09ba90d7f712f24a4f48b39571f15f' and value = 'clarin'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
verify
|
||||
.filter(
|
||||
"id = '50|core_ac_uk__::2e09ba90d7f712f24a4f48b39571f15f' and value = 'dh-ch::subcommunity::2' "
|
||||
+
|
||||
"and trust = '0.9'")
|
||||
.count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCovid19() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[\"iis-covid-19\"]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark
|
||||
.sql(
|
||||
("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(0, verificationDataset.filter("type = 'relation'").count());
|
||||
|
||||
Assertions.assertEquals(0, verify.filter("prov != 'iis::document_covid19'").count());
|
||||
|
||||
Assertions.assertEquals(45093, verify.filter("value = 'covid-19'").count());
|
||||
|
||||
Assertions.assertEquals(45093, verificationDataset.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, verify
|
||||
.filter(
|
||||
"id = '50|_____OmicsDI::039dbb63f11b19dc15113b34ebceb0d2' or " +
|
||||
"id = '50|_____OmicsDI::05f133acca27d72866c6720a95515f57' or " +
|
||||
"id = '50|_____OmicsDI::19c2cff8e86d7ae39f7e34f43ee06735'")
|
||||
.count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAll() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson()
|
||||
.fromJson(
|
||||
"[\"iis-researchinitiative\",\"iis-referenced-projects-main\",\"iis-communities\",\"iis-covid-19\"]",
|
||||
List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<ASResultInfo> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/actionset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ASResultInfo.class));
|
||||
|
||||
Dataset<ASResultInfo> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ASResultInfo.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark
|
||||
.sql(
|
||||
("SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
|
||||
"FROM verificationDataset " +
|
||||
"LATERAL VIEW EXPLODE(value) v as val"));
|
||||
|
||||
Assertions.assertEquals(1768215, verificationDataset.filter("type = 'relation'").count());
|
||||
Assertions.assertEquals((45093 + 1129 + 14505 + 44804), verificationDataset.filter("type = 'result'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRelationMerged() {
|
||||
PrepareInfo pi = new PrepareInfo(false,
|
||||
workingDir.toString() + "/preparedInfo",
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1/asInputPath").getPath(),
|
||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/remapping/step1").getPath(),
|
||||
new Gson().fromJson("[]", List.class),
|
||||
actionSetList);
|
||||
|
||||
pi.run();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<RelationMerges> tmp = sc
|
||||
.textFile(workingDir.toString() + "/preparedInfo/relation")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, RelationMerges.class));
|
||||
|
||||
Dataset<RelationMerges> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(RelationMerges.class));
|
||||
|
||||
verificationDataset.show(false);
|
||||
Assertions.assertEquals(3, verificationDataset.count());
|
||||
|
||||
verificationDataset.createOrReplaceTempView("verificationDataset");
|
||||
|
||||
Dataset<Row> verify = spark
|
||||
.sql(
|
||||
"Select dedupId, me merged " +
|
||||
"from verificationDataset " +
|
||||
"lateral view explode(merges) m as me");
|
||||
|
||||
Assertions.assertEquals(8, verify.count());
|
||||
Assertions
|
||||
.assertEquals(2, verify.filter("dedupId = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f'").count());
|
||||
Assertions
|
||||
.assertEquals(2, verify.filter("dedupId = '50|dedup_wf_001::7df4b3b26df271628a837c209516902a'").count());
|
||||
Assertions
|
||||
.assertEquals(4, verify.filter("dedupId = '50|dedup_wf_001::b04d742132c133177e996add1325ec04'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, verify
|
||||
.filter(
|
||||
"dedupId = '50|dedup_wf_001::1cba00616e303863c34fadaf797d0f8f' " +
|
||||
"and (me ='50|base_oa_____::fb2c70723d74f45329640255a959333d' or me = '50|doiboost____::fb2c70723d74f45329640255a959333d')")
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, verify
|
||||
.filter(
|
||||
"dedupId = '50|dedup_wf_001::7df4b3b26df271628a837c209516902a' " +
|
||||
"and (me ='50|doiboost____::78329557c23bee513963ebf295d1434d' or me = '50|doiboost____::8978b9b797294da5306950a94a58d98c')")
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
4, verify
|
||||
.filter(
|
||||
"dedupId = '50|dedup_wf_001::b04d742132c133177e996add1325ec04' " +
|
||||
"and (me = '50|od______3515::779de9b3a2d224779be52fae43b5fc80' or me = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a' "
|
||||
+
|
||||
"or me = '50|od_______166::779de9b3a2d224779be52fae43b5fc80' or me = '50|od_______165::779de9b3a2d224779be52fae43b5fc80')")
|
||||
.count());
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
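The tests above all rely on the same verification pattern: the PrepareInfo output is a text file of JSON-serialised ASResultInfo records, which is loaded into a typed Dataset and flattened with a LATERAL VIEW EXPLODE over the nested InferenceInfo list. The following is a minimal, self-contained sketch of that pattern, not part of the commit: the helper class and method names are illustrative, and it assumes a local SparkSession plus the ASResultInfo bean defined earlier in this changeset.

package eu.dnetlib.dhp.actionmanager.remapping;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.fasterxml.jackson.databind.ObjectMapper;

// Hypothetical helper (illustrative name): loads prepared-info output and flattens the
// nested InferenceInfo list, mirroring the query used by the tests above.
public class ASResultInfoReader {

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static Dataset<Row> explode(SparkSession spark, String path) {
		JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

		// each line of the prepared-info output is one ASResultInfo serialised as JSON
		JavaRDD<ASResultInfo> rdd = sc
			.textFile(path)
			.map(line -> OBJECT_MAPPER.readValue(line, ASResultInfo.class));

		Dataset<ASResultInfo> ds = spark.createDataset(rdd.rdd(), Encoders.bean(ASResultInfo.class));
		ds.createOrReplaceTempView("asResultInfo");

		// one row per (id, type, inferred value), the shape asserted on by the tests
		return spark
			.sql(
				"SELECT id, type, val.value value, val.trust trust, val.inference_provenance prov " +
					"FROM asResultInfo LATERAL VIEW EXPLODE(value) v AS val");
	}
}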
@@ -1,9 +1,10 @@

package eu.dnetlib.dhp.actionmanager.remapping;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;

@@ -19,136 +20,164 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;

public class RedistributeIISResultTest {

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static final ClassLoader cl = RedistributeIISResultTest.class
		.getClassLoader();

	private static SparkSession spark;

	private static Path workingDir;
	private static final Logger log = LoggerFactory
		.getLogger(RedistributeIISResultTest.class);

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files
			.createTempDirectory(RedistributeIISResultTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(RedistributeIISResultTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(RedistributeIISResultTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	public void redistributeRelationTest() throws Exception {
		SparkRedistributeIISRelations
			.main(
				new String[] {
					"-isSparkSessionManaged",
					Boolean.FALSE.toString(),
					"-asInputPath",
					getClass()
						.getResource("/eu/dnetlib/dhp/actionmanager/remapping/step4/actionset")
						.getPath(),
					"-outputPath",
					workingDir.toString() + "/relationActionSet",
					"-inputPath",
					getClass()
						.getResource("/eu/dnetlib/dhp/actionmanager/remapping/step4/result")
						.getPath()
				});

		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

		JavaRDD<Relation> tmp = sc
			.sequenceFile(workingDir.toString() + "/relationActionSet", Text.class, Text.class)
			.map(item -> OBJECT_MAPPER.readValue(item._2().toString(), AtomicAction.class))
			.map(aa -> ((Relation) aa.getPayload()));

		Dataset<Relation> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));

		Assertions.assertEquals(14, verificationDataset.count());

		Assertions
			.assertEquals(
				3, verificationDataset
					.filter(
						"source = '50|doiboost____::0f10b8f21b7925a344f41edb774f0b0a' and " +
							"(target = '40|rcuk________::8dec51859e6b66cd040670b432b9e59c' or " +
							"target = '40|rcuk________::5e312e08bd65f126d7d79b3d1d677eb3' or " +
							"target = '40|corda_______::6d500f8fceb2bb81b0750820469e1cd8')")
					.count());

		Assertions
			.assertEquals(
				2, verificationDataset
					.filter(
						"source = '50|doiboost____::fb2c70723d74f45329640255a959333d' and " +
							"(target = '40|dfgf________::7aa0b0185d7db055823cd0734ddd6521' or target = '40|anr_________::5437f242b18aa615acf57dced27975c6') ")
					.count());

		Assertions
			.assertEquals(
				2, verificationDataset
					.filter(
						"source = '50|doiboost____::8978b9b797294da5306950a94a58d98c' and " +
							"(target = '40|anr_________::55e85886263bf5abe9e28ba4fda9f4ce' or target = '40|anr_________::5eafc553789cd97a12cab7ed1742e2ca') ")
					.count());

		Assertions
			.assertEquals(
				1, verificationDataset.filter("target = '40|dfgf________::7aa0b0185d7db055823cd0734ddd6521'").count());
		Assertions
			.assertEquals(
				1, verificationDataset.filter("target = '40|anr_________::5437f242b18aa615acf57dced27975c6'").count());
		Assertions
			.assertEquals(
				1, verificationDataset.filter("target = '40|anr_________::5eafc553789cd97a12cab7ed1742e2ca'").count());
		Assertions
			.assertEquals(
				1, verificationDataset.filter("target = '40|anr_________::55e85886263bf5abe9e28ba4fda9f4ce'").count());
		Assertions
			.assertEquals(
				1, verificationDataset.filter("target = '40|corda_______::6d500f8fceb2bb81b0750820469e1cd8'").count());
		Assertions
			.assertEquals(
				1, verificationDataset.filter("target = '40|rcuk________::5e312e08bd65f126d7d79b3d1d677eb3'").count());
		Assertions
			.assertEquals(
				1, verificationDataset.filter("target = '40|rcuk________::8dec51859e6b66cd040670b432b9e59c'").count());

	}

	@Test
	public void redistributeTestResult() throws Exception {
		SparkRedistributeIISResult
			.main(
				new String[] {
					"-isSparkSessionManaged",
					Boolean.FALSE.toString(),
					"-asInputPath",
					getClass()
						.getResource("/eu/dnetlib/dhp/actionmanager/remapping/step4/actionset")
						.getPath(),
					"-outputPath",
					workingDir.toString() + "/resultActionSet",
					"-inputPath",
					getClass()
						.getResource("/eu/dnetlib/dhp/actionmanager/remapping/step4/result")
						.getPath()
				});

		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

		JavaRDD<Result> tmp = sc
			.sequenceFile(workingDir.toString() + "/resultActionSet", Text.class, Text.class)
			.map(item -> OBJECT_MAPPER.readValue(item._2().toString(), AtomicAction.class))
			.map(aa -> ((Result) aa.getPayload()));

		Dataset<Result> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class));

		Assertions.assertEquals(2, verificationDataset.count());

		verificationDataset.createOrReplaceTempView("verificationDataset");

//		Assertions.assertEquals(0, verificationDataset.filter("substr(id,1,8) = '50|dedup'").count());
//
@@ -194,6 +223,6 @@ public class RedistributeIISResultTest {
//			"and value = '40|corda_______::6d500f8fceb2bb81b0750820469e1cd8' and " +
//			"prov = 'iis::document_referencedProjects' and " +
//			"trust = '0.7085'").count());
	}

}
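The redistribute tests above unpack the generated action sets from SequenceFiles of <Text, Text> pairs whose values are JSON-serialised AtomicAction objects wrapping an OAF payload. Below is a small sketch of that read-back step, not part of the commit: the helper class name is illustrative, and only the Spark, Hadoop, and dnet-hadoop schema APIs already used by the tests are assumed.

package eu.dnetlib.dhp.actionmanager.remapping;

import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;

// Hypothetical reader (illustrative name): shows how an action set written as a
// SequenceFile of <Text, Text> pairs is turned back into a typed RDD for verification.
public class ActionSetReader {

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static JavaRDD<Relation> readRelations(JavaSparkContext sc, String path) {
		return sc
			.sequenceFile(path, Text.class, Text.class)
			// the value of each pair is the JSON of an AtomicAction wrapping the payload
			.map(pair -> OBJECT_MAPPER.readValue(pair._2().toString(), AtomicAction.class))
			// the payload type depends on the action set; relation action sets carry Relation
			.map(aa -> (Relation) aa.getPayload());
	}
}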
@@ -1,6 +1,10 @@

package eu.dnetlib.dhp.actionmanager.remapping;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;

@@ -15,82 +19,78 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

public class SelectResultTest {

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static final ClassLoader cl = SelectResultTest.class
		.getClassLoader();

	private static SparkSession spark;

	private static Path workingDir;
	private static final Logger log = LoggerFactory
		.getLogger(SelectResultTest.class);

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files
			.createTempDirectory(SelectResultTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(SelectResultTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(SelectResultTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	public void testSelectResult() throws Exception {
		SparkSelectResults
			.main(
				new String[] {
					"-isSparkSessionManaged",
					Boolean.FALSE.toString(),
					"-inputPath",
					getClass()
						.getResource("/eu/dnetlib/dhp/actionmanager/remapping/step3")
						.getPath(),
					"-outputPath",
					workingDir.toString() + "/selectedResults"
				});

		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

		JavaRDD<ResultPid> tmp = sc
			.textFile(workingDir.toString() + "/selectedResults")
			.map(item -> OBJECT_MAPPER.readValue(item, ResultPid.class));

		Dataset<ResultPid> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(ResultPid.class));

		verificationDataset.show(false);

		Assertions.assertEquals(5, verificationDataset.count());
		Assertions.assertEquals(3, verificationDataset.filter("substr(resultId,1,11) = '50|doiboost'").count());
		Assertions.assertEquals(2, verificationDataset.filter("substr(resultId,1,11) = '50|datacite'").count());

	}

}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -72,8 +72,9 @@ public class ResultTagger implements Serializable {
		// tagging for Subject
		final Set<String> subjects = new HashSet<>();

		if (Objects.nonNull(result.getSubject())) {
			result
				.getSubject()
				.stream()
				.map(subject -> subject.getValue())
				.filter(StringUtils::isNotBlank)

@@ -91,13 +92,13 @@ public class ResultTagger implements Serializable {

		if (Objects.nonNull(result.getInstance())) {
			for (Instance i : result.getInstance()) {
				if (Objects.nonNull(i.getCollectedfrom())) {
					if (Objects.nonNull(i.getCollectedfrom().getKey())) {
						tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|"));
					}
				}
				if (Objects.nonNull(i.getHostedby())) {
					if (Objects.nonNull(i.getHostedby().getKey())) {
						tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|"));
					}
				}
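The ResultTagger hunks above only re-format the nested null checks that collect the datasource part of the collectedfrom/hostedby keys. The same extraction can also be written as a single null-safe chain; the following is a sketch under the assumption that only the getKey() accessor visible in the hunk is available, with an illustrative class name, and is not the committed implementation.

import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;

import org.apache.commons.lang3.StringUtils;

// Hypothetical helper, not part of the commit: adds the datasource identifier that follows
// the '|' separator of a key, skipping null containers and null keys in one chain.
public class DatasourceKeyCollector {

	public static <T> void addKey(Set<String> acc, T datasource, Function<T, String> keyGetter) {
		Optional
			.ofNullable(datasource)
			.map(keyGetter)
			.ifPresent(key -> acc.add(StringUtils.substringAfter(key, "|")));
	}

	// usage in the loop above would look like addKey(tmp, i.getCollectedfrom(), KeyValue::getKey),
	// where KeyValue is the assumed name of the bean exposing getKey()
	public static void main(String[] args) {
		Set<String> tmp = new HashSet<>();
		addKey(tmp, "10|openaire____::abc", Function.identity()); // adds "openaire____::abc"
		addKey(tmp, null, Function.identity()); // a null datasource contributes nothing
		System.out.println(tmp);
	}
}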