This commit is contained in:
parent
c3be9a7b14
commit
e2b9989199
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
/**
|
||||
|
@ -5,11 +6,7 @@ package eu.dnetlib.dhp.skgif.model;
|
|||
* @Date 04/09/23
|
||||
*/
|
||||
public enum AccessRight {
|
||||
OPEN("open"),
|
||||
CLOSED("closed"),
|
||||
EMBARGO("embargo"),
|
||||
RESTRICTED("restricted"),
|
||||
UNAVAILABLE("unavailable");
|
||||
OPEN("open"), CLOSED("closed"), EMBARGO("embargo"), RESTRICTED("restricted"), UNAVAILABLE("unavailable");
|
||||
|
||||
public final String label;
|
||||
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 05/09/23
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -25,4 +26,11 @@ public class Dates implements Serializable {
|
|||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public static Dates newInstance(String value, String type) {
|
||||
Dates d = new Dates();
|
||||
d.value = value;
|
||||
d.type = type;
|
||||
return d;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -26,4 +27,11 @@ public class Identifier implements Serializable {
|
|||
this.value = value;
|
||||
}
|
||||
|
||||
public static Identifier newInstance(String scheme, String value) {
|
||||
Identifier i = new Identifier();
|
||||
i.value = value;
|
||||
i.scheme = scheme;
|
||||
return i;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
|
@ -20,7 +21,7 @@ public class Manifestation implements Serializable {
|
|||
private String peerReview;
|
||||
@JsonProperty("metadata_curation")
|
||||
private String metadataCuration;
|
||||
private URL url;
|
||||
private String url;
|
||||
private String pid;
|
||||
@JsonProperty("access_right")
|
||||
private String accessRight;
|
||||
|
@ -72,11 +73,11 @@ public class Manifestation implements Serializable {
|
|||
this.metadataCuration = metadataCuration;
|
||||
}
|
||||
|
||||
public URL getUrl() {
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public void setUrl(URL url) {
|
||||
public void setUrl(String url) {
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
/**
|
||||
|
@ -5,9 +6,7 @@ package eu.dnetlib.dhp.skgif.model;
|
|||
* @Date 04/09/23
|
||||
*/
|
||||
public enum MetadataCuration {
|
||||
YES("yes"),
|
||||
NO("no"),
|
||||
UNAVAILABLE("unavailable");
|
||||
YES("yes"), NO("no"), UNAVAILABLE("unavailable");
|
||||
|
||||
public final String label;
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
/**
|
||||
|
@ -5,12 +6,8 @@ package eu.dnetlib.dhp.skgif.model;
|
|||
* @Date 04/09/23
|
||||
*/
|
||||
public enum PeerReview {
|
||||
PEER_REVIEWED("peer-reviewed"),
|
||||
NON_PEER_REVIEWED("not peer-reviewed"),
|
||||
DOUBLE_BLIND("double-blind"),
|
||||
SINGLE_BLIND("single-blind"),
|
||||
UNAVAILABLE("unavailable"),
|
||||
OPEN("open peer review");
|
||||
PEER_REVIEWED("peer-reviewed"), NON_PEER_REVIEWED("not peer-reviewed"), DOUBLE_BLIND("double-blind"), SINGLE_BLIND(
|
||||
"single-blind"), UNAVAILABLE("unavailable"), OPEN("open peer review");
|
||||
|
||||
public final String label;
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import org.codehaus.jackson.annotate.JsonProperty;
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import org.codehaus.jackson.annotate.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 05/09/23
|
||||
|
@ -19,7 +20,7 @@ public class Persons implements Serializable {
|
|||
private String familyName;
|
||||
private String agent;
|
||||
@JsonProperty("declared_affiliations")
|
||||
private List<Affiliation>declaredAffiliations;
|
||||
private List<Affiliation> declaredAffiliations;
|
||||
|
||||
public String getLocalIdentifier() {
|
||||
return localIdentifier;
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -7,12 +8,8 @@ import java.io.Serializable;
|
|||
* @Date 05/09/23
|
||||
*/
|
||||
public enum RelationType implements Serializable {
|
||||
OUTCOME("outcome"),
|
||||
AFFILIATION("hasAuthorInstitution"),
|
||||
SUPPLEMENT("IsSupplementedBy"),
|
||||
DOCUMENTS("IsDocumentedBy"),
|
||||
PART("IsPartOf"),
|
||||
VERSION("IsNewVersioneOf");
|
||||
OUTCOME("outcome"), AFFILIATION("hasAuthorInstitution"), SUPPLEMENT("IsSupplementedBy"), DOCUMENTS(
|
||||
"IsDocumentedBy"), PART("IsPartOf"), VERSION("IsNewVersioneOf");
|
||||
|
||||
public final String label;
|
||||
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
import eu.dnetlib.dhp.oa.model.graph.Relation;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
|
@ -15,6 +18,13 @@ public class Relations implements Serializable {
|
|||
@JsonProperty("product_list")
|
||||
private List<String> productList;
|
||||
|
||||
public static Relations newInstance(String relClass, List<String> target) {
|
||||
Relations r = new Relations();
|
||||
r.relationType = relClass;
|
||||
r.productList = target;
|
||||
return r;
|
||||
}
|
||||
|
||||
public String getRelationType() {
|
||||
return relationType;
|
||||
}
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
|
@ -17,7 +18,7 @@ public class ResearchProduct implements Serializable {
|
|||
private List<String> abstracts;
|
||||
@JsonProperty("product_type")
|
||||
private String productType;
|
||||
private List<Topic> topics;
|
||||
private List<ResultTopic> topics;
|
||||
private List<Contribution> contributions;
|
||||
private List<Manifestation> manifestations;
|
||||
@JsonProperty("relevant_organizations")
|
||||
|
@ -66,11 +67,11 @@ public class ResearchProduct implements Serializable {
|
|||
this.productType = productType;
|
||||
}
|
||||
|
||||
public List<Topic> getTopics() {
|
||||
public List<ResultTopic> getTopics() {
|
||||
return topics;
|
||||
}
|
||||
|
||||
public void setTopics(List<Topic> topics) {
|
||||
public void setTopics(List<ResultTopic> topics) {
|
||||
this.topics = topics;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
/**
|
||||
|
@ -5,10 +6,7 @@ package eu.dnetlib.dhp.skgif.model;
|
|||
* @Date 01/09/23
|
||||
*/
|
||||
public enum ResearchTypes {
|
||||
LITERATURE("literature"),
|
||||
RESEARCH_DATA("research data"),
|
||||
RESEARCH_SOFTWARE("research software"),
|
||||
OTHER("other");
|
||||
LITERATURE("literature"), RESEARCH_DATA("research data"), RESEARCH_SOFTWARE("research software"), OTHER("other");
|
||||
|
||||
public final String label;
|
||||
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 16/02/24
|
||||
*/
|
||||
public class ResultTopic implements Serializable {
|
||||
private String topic;
|
||||
private Provenance provenance;
|
||||
|
||||
public String getTopic() {
|
||||
return topic;
|
||||
}
|
||||
|
||||
public void setTopic(String topic) {
|
||||
this.topic = topic;
|
||||
}
|
||||
|
||||
public Provenance getProvenance() {
|
||||
return provenance;
|
||||
}
|
||||
|
||||
public void setProvenance(Provenance provenance) {
|
||||
this.provenance = provenance;
|
||||
}
|
||||
}
|
|
@ -1,28 +1,39 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 01/09/23
|
||||
*/
|
||||
public class Topic implements Serializable {
|
||||
private String topic;
|
||||
private Provenance provenance;
|
||||
private String local_identifier;
|
||||
private List<Identifier> identifiers;
|
||||
private String name;
|
||||
|
||||
public String getTopic() {
|
||||
return topic;
|
||||
public String getLocal_identifier() {
|
||||
return local_identifier;
|
||||
}
|
||||
|
||||
public void setTopic(String topic) {
|
||||
this.topic = topic;
|
||||
public void setLocal_identifier(String local_identifier) {
|
||||
this.local_identifier = local_identifier;
|
||||
}
|
||||
|
||||
public Provenance getProvenance() {
|
||||
return provenance;
|
||||
public List<Identifier> getIdentifiers() {
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
public void setProvenance(Provenance provenance) {
|
||||
this.provenance = provenance;
|
||||
public void setIdentifiers(List<Identifier> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,304 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.skgif.Utils.getOrcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.skgif.beans.EmitPerManifestation;
|
||||
import eu.dnetlib.dhp.skgif.beans.PartialResearchProduct;
|
||||
import eu.dnetlib.dhp.skgif.beans.RelationPerProduct;
|
||||
import eu.dnetlib.dhp.skgif.model.*;
|
||||
import eu.dnetlib.dhp.skgif.model.AccessRight;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 06/02/24
|
||||
*/
|
||||
public class DumpResult implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DumpResult.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
PrepareResultRelation.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/dump_result_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
mapResult(spark, inputPath, outputPath, workingDir);
|
||||
});
|
||||
}
|
||||
|
||||
//per ogni result emetto id + journal se esiste + istanza + hosted by dell'istanza
|
||||
public static <R extends Result> void mapResult(SparkSession spark, String inputPath, String outputPath,
|
||||
String workingDir) {
|
||||
|
||||
// selection of the relevant relations from result type to other entity. Only teh semantic relevant ones are
|
||||
// considered
|
||||
selectRelations(spark, inputPath, workingDir);
|
||||
|
||||
// merge of relations and manifestation for the same result
|
||||
getRelationAndManifestation(spark, workingDir, inputPath);
|
||||
|
||||
// dump of the result and enrichment with relevant information for relations and manifestations
|
||||
dumpResult(spark, inputPath, workingDir);
|
||||
|
||||
}
|
||||
|
||||
private static void getRelationAndManifestation(SparkSession spark, String workingDir, String inputPath) {
|
||||
Dataset<RelationPerProduct> aggRelations = Utils
|
||||
.readPath(spark, workingDir + "aggrelation", RelationPerProduct.class);
|
||||
ModelSupport.entityTypes
|
||||
.keySet()
|
||||
.parallelStream()
|
||||
.filter(ModelSupport::isResult)
|
||||
.forEach(e -> {
|
||||
Dataset<Datasource> datasource = Utils
|
||||
.readPath(spark, inputPath + "/datasource", Datasource.class)
|
||||
.filter(
|
||||
(FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEosctype()).isPresent() &&
|
||||
d.getEosctype().getClassname().equalsIgnoreCase("Journal archive"));
|
||||
|
||||
Dataset<EmitPerManifestation> man = Utils
|
||||
.readPath(spark, workingDir + e.name() + "/manifestation", EmitPerManifestation.class);
|
||||
man
|
||||
.joinWith(aggRelations, man.col("resultId").equalTo(aggRelations.col("resultId")), "left")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<EmitPerManifestation, RelationPerProduct>, String>) t2 -> t2
|
||||
._1()
|
||||
.getResultId(),
|
||||
Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, Tuple2<EmitPerManifestation, RelationPerProduct>, PartialResearchProduct>) (
|
||||
k, v) -> {
|
||||
PartialResearchProduct prp = new PartialResearchProduct();
|
||||
prp.setResultId(k);
|
||||
List<EmitPerManifestation> epms = new ArrayList<>();
|
||||
Tuple2<EmitPerManifestation, RelationPerProduct> first = v.next();
|
||||
RelationPerProduct rpp = first._2();
|
||||
epms.add(first._1());
|
||||
v.forEachRemaining(t2 -> epms.add(t2._1()));
|
||||
Dataset<EmitPerManifestation> emitformanifestation = spark
|
||||
.createDataset(epms, Encoders.bean(EmitPerManifestation.class));
|
||||
prp.setManifestations(getManifestationList(emitformanifestation, datasource));
|
||||
prp.setRelatedProducts(rpp.getRelatedProduct());
|
||||
prp.setRelevantOrganizations(rpp.getOrganizations());
|
||||
prp.setFunding(rpp.getFunding());
|
||||
return prp;
|
||||
}, Encoders.bean(PartialResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/partialResearchproduct");
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
private static List<Manifestation> getManifestationList(Dataset<EmitPerManifestation> emitformanifestation,
|
||||
Dataset<Datasource> datasource) {
|
||||
return emitformanifestation
|
||||
.joinWith(
|
||||
datasource, emitformanifestation
|
||||
.col("hostedBy")
|
||||
.equalTo(datasource.col("id")),
|
||||
"left")
|
||||
.map((MapFunction<Tuple2<EmitPerManifestation, Datasource>, Manifestation>) t2 -> {
|
||||
// se il lato sinistro c'e' allora ho la biblio e la venue
|
||||
// se non c'e' allora ho solo gli altri valori
|
||||
EmitPerManifestation epm = t2._1();
|
||||
Manifestation manifestation = new Manifestation();
|
||||
manifestation.setProductLocalTypeSchema(epm.getInstance().getInstancetype().getClassname());
|
||||
manifestation.setProductLocalTypeSchema(epm.getInstance().getInstancetype().getSchemename());
|
||||
manifestation
|
||||
.setDates(
|
||||
Arrays
|
||||
.asList(
|
||||
Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
|
||||
if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
|
||||
switch (epm.getInstance().getRefereed().getClassid()) {
|
||||
case "0000":
|
||||
manifestation.setPeerReview(PeerReview.UNAVAILABLE.label);
|
||||
break;
|
||||
case "0001":
|
||||
manifestation.setPeerReview(PeerReview.PEER_REVIEWED.label);
|
||||
break;
|
||||
case "0002":
|
||||
manifestation.setPeerReview(PeerReview.NON_PEER_REVIEWED.label);
|
||||
break;
|
||||
}
|
||||
|
||||
manifestation.setMetadataCuration("unavailable");
|
||||
if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
|
||||
switch (epm.getInstance().getAccessright().getClassid()) {
|
||||
case "OPEN":
|
||||
case "OPEN DATA":
|
||||
case "OPEN SOURCE":
|
||||
manifestation.setAccessRight(AccessRight.OPEN.label);
|
||||
break;
|
||||
case "CLOSED":
|
||||
manifestation.setAccessRight(AccessRight.CLOSED.label);
|
||||
break;
|
||||
case "RESTRICTED":
|
||||
manifestation.setAccessRight(AccessRight.RESTRICTED.label);
|
||||
break;
|
||||
case "EMBARGO":
|
||||
case "12MONTHS":
|
||||
case "6MONTHS":
|
||||
manifestation.setAccessRight(AccessRight.EMBARGO.label);
|
||||
break;
|
||||
default:
|
||||
manifestation.setAccessRight(AccessRight.UNAVAILABLE.label);
|
||||
|
||||
}
|
||||
manifestation.setLicence(epm.getInstance().getLicense().getValue());
|
||||
manifestation.setUrl(epm.getInstance().getUrl().get(0));
|
||||
if (Optional.ofNullable(epm.getInstance().getPid()).isPresent()) {
|
||||
manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
|
||||
}
|
||||
if (Optional.ofNullable(t2._2()).isPresent())
|
||||
manifestation.setBiblio(getBiblio(epm));
|
||||
manifestation.setVenue("venue_______::" + DHPUtils.md5(epm.getInstance().getHostedby().getKey()));
|
||||
manifestation
|
||||
.setHostingDatasource("datasource__::" + DHPUtils.md5(epm.getInstance().getHostedby().getKey()));
|
||||
return manifestation;
|
||||
}, Encoders.bean(Manifestation.class))
|
||||
.collectAsList();
|
||||
}
|
||||
|
||||
private static Biblio getBiblio(EmitPerManifestation epm) {
|
||||
Biblio biblio = new Biblio();
|
||||
biblio.setEdition(epm.getJournal().getEdition());
|
||||
biblio.setIssue(epm.getJournal().getIss());
|
||||
biblio.setPublisher(epm.getPublisher());
|
||||
biblio.setVolume(epm.getJournal().getVol());
|
||||
biblio.setEndPage(epm.getJournal().getEp());
|
||||
biblio.setStartPage(epm.getJournal().getSp());
|
||||
return biblio;
|
||||
}
|
||||
|
||||
private static <R extends Result> void dumpResult(SparkSession spark, String inputPath, String workingDir) {
|
||||
ModelSupport.entityTypes
|
||||
.keySet()
|
||||
.parallelStream()
|
||||
.filter(ModelSupport::isResult)
|
||||
.forEach(e -> {
|
||||
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
|
||||
|
||||
Dataset<R> results = Utils.readPath(spark, inputPath + e.name(), resultClazz);
|
||||
Dataset<PartialResearchProduct> prr = Utils
|
||||
.readPath(spark, workingDir + e.name() + "/partialresearchproduct", PartialResearchProduct.class);
|
||||
|
||||
results
|
||||
.joinWith(prr, results.col("id").equalTo(prr.col("resultId")), "left")
|
||||
.map((MapFunction<Tuple2<R, PartialResearchProduct>, ResearchProduct>) t2 -> {
|
||||
ResearchProduct rp = ResultMapper.map(t2._1());
|
||||
rp.setRelatedProducts(t2._2().getRelatedProducts());
|
||||
rp.setFunding(t2._2().getFunding());
|
||||
rp.setRelevantOrganizations(t2._2().getRelevantOrganizations());
|
||||
rp.setManifestations(rp.getManifestations());
|
||||
return rp;
|
||||
}, Encoders.bean(ResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/researchproduct");
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void selectRelations(SparkSession spark, String inputPath, String workingDir) {
|
||||
Dataset<Relation> relation = spark
|
||||
.read()
|
||||
.json(inputPath + "/relation")
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.filter("dataInfo.deletedbyinference != true and dataInfo.invisible != true")
|
||||
.filter(
|
||||
"relClass == 'HasAuthorInstitution' or relClass == 'IsProducedBy' or " +
|
||||
"relClass == 'IsSupplementedBy' or relClass == 'IsDocumentedBy' or relClass == 'IsPartOf' " +
|
||||
"relClass == 'IsNewVersionOf' or relClass == 'Cites'");
|
||||
|
||||
relation
|
||||
.groupByKey((MapFunction<Relation, String>) r -> r.getSource(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Relation, RelationPerProduct>) (k, v) -> {
|
||||
RelationPerProduct rpp = new RelationPerProduct();
|
||||
rpp.setResultId(k);
|
||||
Map<String, List<String>> remainignRelations = new HashMap<>();
|
||||
while (v.hasNext()) {
|
||||
Relation rel = v.next();
|
||||
String target = rel.getTarget();
|
||||
String relClass = rel.getRelClass();
|
||||
switch (rel.getRelClass().toLowerCase()) {
|
||||
case "hasauthorinstitution":
|
||||
rpp.getOrganizations().add("organization::" + DHPUtils.md5(target));
|
||||
break;
|
||||
case "isproducedby":
|
||||
rpp.getFunding().add("grant_______::" + DHPUtils.md5(target));
|
||||
break;
|
||||
default:
|
||||
if (!remainignRelations.keySet().contains(relClass))
|
||||
remainignRelations.put(relClass, new ArrayList<>());
|
||||
remainignRelations.get(relClass).add("product_____::" + DHPUtils.md5(target));
|
||||
}
|
||||
}
|
||||
for (String key : remainignRelations.keySet())
|
||||
rpp.getRelatedProduct().add(Relations.newInstance(key, remainignRelations.get(key)));
|
||||
return rpp;
|
||||
}, Encoders.bean(RelationPerProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + "/aggrelation");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,251 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.skgif.Utils.getOrcid;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.skgif.beans.EmitPerManifestation;
|
||||
import eu.dnetlib.dhp.skgif.model.*;
|
||||
import eu.dnetlib.dhp.skgif.model.AccessRight;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Array;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 06/02/24
|
||||
*/
|
||||
public class EmitFromResults implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(EmitFromResults.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
PrepareResultRelation.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/emit_biblio_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
emitFromResult(spark, inputPath, outputPath, workingDir);
|
||||
});
|
||||
}
|
||||
|
||||
//per ogni result emetto id + journal se esiste + istanza + hosted by dell'istanza
|
||||
public static <R extends Result> void emitFromResult(SparkSession spark, String inputPath, String outputPath,
|
||||
String workingDir) {
|
||||
emitManifestation(spark, inputPath, workingDir);
|
||||
emitPerson(spark, inputPath, outputPath, workingDir);
|
||||
emitTopic(spark, inputPath, outputPath, workingDir);
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void emitTopic(SparkSession spark, String inputPath, String outputPath,
|
||||
String workingDir) {
|
||||
ModelSupport.entityTypes.keySet().forEach(e -> {
|
||||
if (ModelSupport.isResult(e)) {
|
||||
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
|
||||
Utils
|
||||
.readPath(spark, inputPath + e.name(), resultClazz)
|
||||
.filter((FilterFunction<R>) r -> Optional.of(r.getSubject()).isPresent())
|
||||
.flatMap(
|
||||
(FlatMapFunction<R, Topic>) r -> r
|
||||
.getSubject()
|
||||
.stream()
|
||||
.filter(s -> !s.getQualifier().getClassid().equalsIgnoreCase("keyword"))
|
||||
.map(s -> {
|
||||
Topic t = new Topic();
|
||||
t
|
||||
.setLocal_identifier(
|
||||
"topic_______::" + DHPUtils.md5(s.getQualifier().getSchemeid() + s.getValue()));
|
||||
t
|
||||
.setIdentifiers(
|
||||
Arrays
|
||||
.asList(
|
||||
Identifier.newInstance(s.getQualifier().getSchemeid(), s.getValue())));
|
||||
t.setName(s.getValue());
|
||||
return t;
|
||||
})
|
||||
.collect(Collectors.toList())
|
||||
.iterator(),
|
||||
Encoders.bean(Topic.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/topic");
|
||||
}
|
||||
});
|
||||
Dataset<Topic> topics = spark.emptyDataset(Encoders.bean(Topic.class));
|
||||
|
||||
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
|
||||
if (ModelSupport.isResult(entityType))
|
||||
topics = topics.union(Utils.readPath(spark, workingDir + entityType.name() + "/topic", Topic.class));
|
||||
}
|
||||
topics
|
||||
.groupByKey((MapFunction<Topic, String>) p -> p.getLocal_identifier(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Topic, Topic>) (k, v) -> v.next(), Encoders.bean(Topic.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/Topic");
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void emitPerson(SparkSession spark, String inputPath, String outputPath,
|
||||
String workingDir) {
|
||||
ModelSupport.entityTypes.keySet().forEach(e -> {
|
||||
if (ModelSupport.isResult(e)) {
|
||||
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
|
||||
Utils
|
||||
.readPath(spark, inputPath + e.name(), resultClazz)
|
||||
.flatMap((FlatMapFunction<R, Persons>) r -> {
|
||||
List<Persons> authors = new ArrayList<>();
|
||||
|
||||
if (Optional.ofNullable(r.getAuthor()).isPresent()) {
|
||||
int count = 0;
|
||||
for (Author a : r.getAuthor()) {
|
||||
count += 1;
|
||||
Persons p = new Persons();
|
||||
p.setFamilyName(a.getSurname());
|
||||
p.setGivenName(a.getName());
|
||||
String identifier = new String();
|
||||
if (Optional.ofNullable(a.getPid()).isPresent()) {
|
||||
Tuple2<String, Boolean> orcid = getOrcid(a.getPid());
|
||||
if (orcid != null) {
|
||||
identifier = "person______::" + DHPUtils.md5(orcid._1() + orcid._2());
|
||||
if (orcid._2())
|
||||
p
|
||||
.setIdentifiers(
|
||||
Arrays.asList(Identifier.newInstance("orcid", orcid._1())));
|
||||
else
|
||||
p
|
||||
.setIdentifiers(
|
||||
Arrays.asList(Identifier.newInstance("orcid_pending", orcid._1())));
|
||||
} else {
|
||||
if (Optional.ofNullable(a.getRank()).isPresent()) {
|
||||
identifier = "tmp_person__::" + DHPUtils.md5(r.getId() + a.getRank());
|
||||
} else {
|
||||
identifier = "tmp_person__::" + DHPUtils.md5(r.getId() + count);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
p.setLocalIdentifier(identifier);
|
||||
}
|
||||
|
||||
}
|
||||
return authors.iterator();
|
||||
}, Encoders.bean(Persons.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/person");
|
||||
}
|
||||
});
|
||||
Dataset<Persons> persons = spark.emptyDataset(Encoders.bean(Persons.class));
|
||||
|
||||
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
|
||||
if (ModelSupport.isResult(entityType))
|
||||
persons = persons
|
||||
.union(Utils.readPath(spark, workingDir + entityType.name() + "/person", Persons.class));
|
||||
}
|
||||
persons
|
||||
.groupByKey((MapFunction<Persons, String>) p -> p.getLocalIdentifier(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Persons, Persons>) (k, v) -> v.next(), Encoders.bean(Persons.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/Persons");
|
||||
|
||||
}
|
||||
|
||||
private static <R extends Result> void emitManifestation(SparkSession spark, String inputPath, String workingDir) {
|
||||
Dataset<Datasource> datasource = Utils
|
||||
.readPath(spark, inputPath + "datasource", Datasource.class)
|
||||
.filter(
|
||||
(FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEosctype()).isPresent() &&
|
||||
d.getEosctype().getClassname().equalsIgnoreCase("Journal archive"));
|
||||
|
||||
ModelSupport.entityTypes.keySet().forEach(e -> {
|
||||
if (ModelSupport.isResult(e)) {
|
||||
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
|
||||
// Dataset<EmitPerManifestation> emitformanifestation =
|
||||
Utils
|
||||
.readPath(spark, inputPath + e.name(), resultClazz)
|
||||
.flatMap((FlatMapFunction<R, EmitPerManifestation>) p -> p.getInstance().stream().map(i -> {
|
||||
EmitPerManifestation epb = new EmitPerManifestation();
|
||||
epb.setResultId(p.getId());
|
||||
epb.setInstance(i);
|
||||
epb.setHostedBy(i.getHostedby().getKey());
|
||||
epb
|
||||
.setPublisher(
|
||||
Optional
|
||||
.ofNullable(p.getPublisher())
|
||||
.map(v -> v.getValue())
|
||||
.orElse(new String()));
|
||||
if (p.getClass() == Publication.class) {
|
||||
epb.setJournal(((Publication) p).getJournal());
|
||||
}
|
||||
return epb;
|
||||
}).collect(Collectors.toList()).iterator(), Encoders.bean(EmitPerManifestation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir + e.name() + "/manifestation");
|
||||
;
|
||||
|
||||
}
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
}
|
|
@ -1,77 +0,0 @@
|
|||
package eu.dnetlib.dhp.skgif;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.skgif.model.ResearchProduct;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 06/02/24
|
||||
*/
|
||||
public class JournalsFromDatasources implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(JournalsFromDatasources.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
PrepareResultRelation.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/journals_from_datasource_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
final String datasourcePath = parser.get("datasourcePath");
|
||||
log.info("datasourcePath: {}", datasourcePath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
extendResult(spark, inputPath, outputPath, datasourcePath);
|
||||
});
|
||||
}
|
||||
|
||||
//find the results having a container in the metadata
|
||||
//map all the hostedby.key delle istanze associate al risultato
|
||||
//find a corrispondence to a datasource which is a journal
|
||||
//write for the result the biblio
|
||||
public static void extendResult(SparkSession spark, String inputPath, String outputPath, String datasourcePath ){
|
||||
Dataset<Datasource> datasource = Utils.readPath(spark, datasourcePath, Datasource.class)
|
||||
.filter((FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEosctype()).isPresent() &&
|
||||
d.getEosctype().getClassname().equalsIgnoreCase("Journal archive");
|
||||
|
||||
Dataset<ResearchProduct> results = Utils.readPath(spark, inputPath, ResearchProduct.class);
|
||||
|
||||
|
||||
}
|
||||
}
|
|
@ -9,6 +9,7 @@ import java.util.*;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -58,17 +59,23 @@ public class PrepareResultRelation implements Serializable {
|
|||
}
|
||||
|
||||
private static void prepareResultRelationList(SparkSession spark, String inputPath, String outputPath) {
|
||||
|
||||
final StructType structureSchema = new StructType()
|
||||
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
|
||||
.add(
|
||||
"dataInfo", new StructType()
|
||||
.add("deletedbyinference", DataTypes.BooleanType)
|
||||
.add("invisible", DataTypes.BooleanType))
|
||||
.add("id", DataTypes.StringType);
|
||||
|
||||
Dataset<Relation> relation = spark
|
||||
.read()
|
||||
.json(inputPath)
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.filter("dataInfo.deletedbyinference != true and dataInfo.invisible != true")
|
||||
.filter(
|
||||
"relClass == 'hasAuthorInstitution' or relClass == 'outcome' or " +
|
||||
"subRelType == 'affiliation' or subRelType == 'outcome' or " +
|
||||
"relClass == 'IsSupplementedBy' or relClass == 'IsDocumentedBy' or relClass == 'IsPartOf' " +
|
||||
"relClass == IsNewVersionOf");
|
||||
"relClass == 'IsNewVersionOf' or relClass == 'Cites'");
|
||||
|
||||
org.apache.spark.sql.Dataset<Row> df = spark.createDataFrame(new ArrayList<Row>(), structureSchema);
|
||||
List<String> entities = Arrays
|
||||
|
|
|
@ -41,7 +41,7 @@ public class ResultMapper implements Serializable {
|
|||
Optional<Qualifier> ort = Optional.ofNullable(input.getResulttype());
|
||||
if (ort.isPresent()) {
|
||||
try {
|
||||
out.setLocalIdentifier(input.getId());
|
||||
out.setLocalIdentifier("product___::" + DHPUtils.md5(input.getId()));
|
||||
mapPid(out, input);
|
||||
mapTitle(out, input);
|
||||
mapAbstract(out, input);
|
||||
|
@ -49,17 +49,7 @@ public class ResultMapper implements Serializable {
|
|||
mapTopic(out, input);
|
||||
mapContribution(out, input);
|
||||
|
||||
if (!Optional.ofNullable(out.getTitles()).isPresent() ||
|
||||
!Optional.ofNullable(out.getContributions()).isPresent())
|
||||
return null;
|
||||
// TODO map the manifestation directly from the instances
|
||||
// it is not completed
|
||||
mapManifestation(out, input);
|
||||
|
||||
// TODO extend the mapping to consider relations between these entities and the results
|
||||
// private List<String> relevantOrganizations;
|
||||
// private List<String> funding;
|
||||
// private List<Relations> relatedProducts;
|
||||
//The manifestation will be included extending the result as well as the relations to funder, organization and other results
|
||||
|
||||
} catch (ClassCastException cce) {
|
||||
return null;
|
||||
|
@ -70,101 +60,6 @@ public class ResultMapper implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
private static <E extends Result> void mapManifestation(ResearchProduct out, E input) {
|
||||
out
|
||||
.setManifestations(
|
||||
input
|
||||
.getInstance()
|
||||
.stream()
|
||||
.parallel()
|
||||
.map(i -> {
|
||||
try {
|
||||
return getManifestation(i);
|
||||
} catch (MalformedURLException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
private static Manifestation getManifestation(Instance i) throws MalformedURLException {
|
||||
Manifestation manifestation = new Manifestation();
|
||||
manifestation.setProductLocalType(i.getInstancetype().getClassname());
|
||||
manifestation.setProductLocalTypeSchema(i.getInstancetype().getSchemename());
|
||||
Dates dates = new Dates();
|
||||
dates.setType("publishing");
|
||||
dates.setValue(i.getDateofacceptance().getValue());
|
||||
manifestation.setDates(Arrays.asList(dates));
|
||||
switch (i.getRefereed().getClassid()) {
|
||||
case "0000":
|
||||
manifestation.setPeerReview(PeerReview.UNAVAILABLE.label);
|
||||
break;
|
||||
case "0001":
|
||||
manifestation.setPeerReview(PeerReview.PEER_REVIEWED.label);
|
||||
break;
|
||||
case "0002":
|
||||
manifestation.setPeerReview(PeerReview.NON_PEER_REVIEWED.label);
|
||||
break;
|
||||
}
|
||||
manifestation.setMetadataCuration(MetadataCuration.UNAVAILABLE.label);
|
||||
// TODO filter out the URL that refer to pids. If nothing remains, decide what to do
|
||||
manifestation.setUrl(new URL(i.getUrl().get(0)));
|
||||
if (Optional.ofNullable(i.getPid()).isPresent()) {
|
||||
manifestation.setPid(i.getPid().get(0).getValue());
|
||||
}
|
||||
switch (i.getAccessright().getClassid()) {
|
||||
case "OPEN":
|
||||
case "OPEN DATA":
|
||||
case "OPEN SOURCE":
|
||||
manifestation.setAccessRight(AccessRight.OPEN.label);
|
||||
break;
|
||||
case "CLOSED":
|
||||
manifestation.setAccessRight(AccessRight.CLOSED.label);
|
||||
break;
|
||||
case "RESTRICTED":
|
||||
manifestation.setAccessRight(AccessRight.RESTRICTED.label);
|
||||
break;
|
||||
case "EMBARGO":
|
||||
case "12MONTHS":
|
||||
case "6MONTHS":
|
||||
manifestation.setAccessRight(AccessRight.EMBARGO.label);
|
||||
break;
|
||||
default:
|
||||
manifestation.setAccessRight(AccessRight.UNAVAILABLE.label);
|
||||
}
|
||||
if (Optional.ofNullable(i.getLicense()).isPresent())
|
||||
manifestation.setLicence(i.getLicense().getValue());
|
||||
// TODO to fill the biblio in case it is a journal, we need to join with the datasource and verify the type
|
||||
Biblio biblio = null;
|
||||
manifestation.setHostingDatasource(i.getHostedby().getKey());
|
||||
// TODO verify if the result is published in ojournal or conferences. In that case the venue is the identifier
|
||||
// of the journal/conference. In case it is not, the venue is the datasource
|
||||
if (biblio == null) {
|
||||
manifestation.setVenue(i.getHostedby().getKey());
|
||||
} else {
|
||||
manifestation.setVenue("insert the id of the venue");
|
||||
}
|
||||
return manifestation;
|
||||
}
|
||||
|
||||
private static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
|
||||
if (!Optional.ofNullable(pid).isPresent())
|
||||
return null;
|
||||
if (pid.size() == 0)
|
||||
return null;
|
||||
for (StructuredProperty p : pid) {
|
||||
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
return new Tuple2<>(p.getValue(), Boolean.TRUE);
|
||||
}
|
||||
}
|
||||
for (StructuredProperty p : pid) {
|
||||
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING)) {
|
||||
return new Tuple2<>(p.getValue(), Boolean.FALSE);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static <E extends Result> void mapContribution(ResearchProduct out, E input) {
|
||||
if (Optional.ofNullable(input.getAuthor()).isPresent()) {
|
||||
int count = 0;
|
||||
|
@ -172,14 +67,14 @@ public class ResultMapper implements Serializable {
|
|||
count += 1;
|
||||
Contribution contribution = new Contribution();
|
||||
if (Optional.ofNullable(a.getPid()).isPresent()) {
|
||||
Tuple2<String, Boolean> orcid = getOrcid(a.getPid());
|
||||
Tuple2<String, Boolean> orcid = Utils.getOrcid(a.getPid());
|
||||
if (orcid != null) {
|
||||
contribution.setPerson("person______::"+DHPUtils.md5(orcid._1() + orcid._2()));
|
||||
contribution.setPerson("person______::" + DHPUtils.md5(orcid._1() + orcid._2()));
|
||||
} else {
|
||||
if (Optional.ofNullable(a.getRank()).isPresent()) {
|
||||
contribution.setPerson("person______::"+DHPUtils.md5(input.getId() + a.getRank()));
|
||||
contribution.setPerson("person______::" + DHPUtils.md5(input.getId() + a.getRank()));
|
||||
} else {
|
||||
contribution.setPerson("tmp_person__::"+DHPUtils.md5(input.getId() + count));
|
||||
contribution.setPerson("tmp_person__::" + DHPUtils.md5(input.getId() + count));
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -187,32 +82,31 @@ public class ResultMapper implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
// "contributions": [
|
||||
// {
|
||||
// "person": "person_123",
|
||||
// "declared_affiliations": ["org_1", "org_3"],
|
||||
// "rank": 1,
|
||||
// "roles": ["writing-original-draft", "conceptualization"]
|
||||
// }
|
||||
// ]
|
||||
}
|
||||
|
||||
private static <E extends Result> void mapTopic(ResearchProduct out, E input) {
|
||||
if (Optional.ofNullable(input.getSubject()).isPresent()) {
|
||||
out.setTopics(input.getSubject().stream().parallel().map(s -> {
|
||||
Topic topic = new Topic();
|
||||
out
|
||||
.setTopics(
|
||||
input
|
||||
.getSubject()
|
||||
.stream()
|
||||
.filter(s -> !s.getQualifier().getClassid().equalsIgnoreCase("keyword"))
|
||||
.map(s -> {
|
||||
ResultTopic topic = new ResultTopic();
|
||||
topic.setTopic(getIdentifier(s));
|
||||
Provenance provenance = new Provenance();
|
||||
provenance.setTrust(Double.valueOf(s.getDataInfo().getTrust()));
|
||||
provenance.setType(s.getDataInfo().getInferenceprovenance());
|
||||
topic.setProvenance(provenance);
|
||||
return topic;
|
||||
}).collect(Collectors.toList()));
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
private static String getIdentifier(StructuredProperty s) {
|
||||
return DHPUtils.md5(s.getQualifier().getClassid() + s.getValue());
|
||||
return "topic_______::" + DHPUtils.md5(s.getQualifier().getClassid() + s.getValue());
|
||||
}
|
||||
|
||||
private static <E extends Result> void mapType(ResearchProduct out, E input) throws NoAllowedTypeException {
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 16/02/24
|
||||
*/
|
||||
public class Utils implements Serializable {
|
||||
|
||||
public static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
|
||||
if (!Optional.ofNullable(pid).isPresent())
|
||||
return null;
|
||||
if (pid.size() == 0)
|
||||
return null;
|
||||
for (StructuredProperty p : pid) {
|
||||
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
return new Tuple2<>(p.getValue(), Boolean.TRUE);
|
||||
}
|
||||
}
|
||||
for (StructuredProperty p : pid) {
|
||||
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING)) {
|
||||
return new Tuple2<>(p.getValue(), Boolean.FALSE);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.beans;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.skgif.model.Biblio;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 15/02/24
|
||||
*/
|
||||
public class EmitPerManifestation implements Serializable {
|
||||
private String resultId;
|
||||
private String hostedBy;
|
||||
private Journal journal;
|
||||
private Instance instance;
|
||||
private String publisher;
|
||||
|
||||
public String getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(String publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public String getResultId() {
|
||||
return resultId;
|
||||
}
|
||||
|
||||
public void setResultId(String resultId) {
|
||||
this.resultId = resultId;
|
||||
}
|
||||
|
||||
public String getHostedBy() {
|
||||
return hostedBy;
|
||||
}
|
||||
|
||||
public void setHostedBy(String hostedBy) {
|
||||
this.hostedBy = hostedBy;
|
||||
}
|
||||
|
||||
public Journal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
public void setJournal(Journal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
public Instance getInstance() {
|
||||
return instance;
|
||||
}
|
||||
|
||||
public void setInstance(Instance instance) {
|
||||
this.instance = instance;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.beans;
|
||||
|
||||
import eu.dnetlib.dhp.skgif.model.ResearchProduct;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 16/02/24
|
||||
*/
|
||||
public class PartialResearchProduct extends ResearchProduct {
|
||||
private String resultId;
|
||||
|
||||
public String getResultId() {
|
||||
return resultId;
|
||||
}
|
||||
|
||||
public void setResultId(String resultId) {
|
||||
this.resultId = resultId;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
|
||||
package eu.dnetlib.dhp.skgif.beans;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.skgif.model.Relations;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
* @Date 16/02/24
|
||||
*/
|
||||
public class RelationPerProduct implements Serializable {
|
||||
|
||||
private String resultId;
|
||||
private List<String> organizations;
|
||||
private List<String> funding;
|
||||
private List<Relations> relatedProduct;
|
||||
|
||||
public RelationPerProduct() {
|
||||
organizations = new ArrayList<>();
|
||||
funding = new ArrayList<>();
|
||||
relatedProduct = new ArrayList<>();
|
||||
}
|
||||
|
||||
public String getResultId() {
|
||||
return resultId;
|
||||
}
|
||||
|
||||
public void setResultId(String resultId) {
|
||||
this.resultId = resultId;
|
||||
}
|
||||
|
||||
public List<String> getOrganizations() {
|
||||
return organizations;
|
||||
}
|
||||
|
||||
public void setOrganizations(List<String> organizations) {
|
||||
this.organizations = organizations;
|
||||
}
|
||||
|
||||
public List<String> getFunding() {
|
||||
return funding;
|
||||
}
|
||||
|
||||
public void setFunding(List<String> funding) {
|
||||
this.funding = funding;
|
||||
}
|
||||
|
||||
public List<Relations> getRelatedProduct() {
|
||||
return relatedProduct;
|
||||
}
|
||||
|
||||
public void setRelatedProduct(List<Relations> relatedProduct) {
|
||||
this.relatedProduct = relatedProduct;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,783 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.skgif;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Constants;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob;
|
||||
import eu.dnetlib.dhp.oa.model.Instance;
|
||||
import eu.dnetlib.dhp.oa.model.OpenAccessRoute;
|
||||
import eu.dnetlib.dhp.oa.model.Score;
|
||||
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.oa.model.graph.GraphResult;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.skgif.EmitFromResults;
|
||||
import eu.dnetlib.dhp.skgif.beans.EmitPerManifestation;
|
||||
import eu.dnetlib.dhp.skgif.model.Manifestation;
|
||||
import eu.dnetlib.dhp.skgif.model.Persons;
|
||||
import eu.dnetlib.dhp.skgif.model.ResultTopic;
|
||||
|
||||
//@Disabled
|
||||
public class EmitFromResultJobTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(EmitFromResultJobTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files.createTempDirectory(EmitFromResultJobTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(EmitFromResultJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(EmitFromResultJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEmitFromResult() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/publication_extendedinstance")
|
||||
.getPath();
|
||||
|
||||
EmitFromResults
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-outputPath", workingDir.toString() + "/result/",
|
||||
"-workingDir", workingDir.toString() + "/"
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Persons> persons = sc
|
||||
.textFile(workingDir.toString() + "/result/Persons")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Persons.class));
|
||||
|
||||
JavaRDD<ResultTopic> topics = sc
|
||||
.textFile(workingDir.toString() + "/result/Topic")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, ResultTopic.class));
|
||||
|
||||
JavaRDD<EmitPerManifestation> manifestation = sc
|
||||
.textFile(workingDir.toString() + "/result/Persons")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, EmitPerManifestation.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<Persons> personsDataset = spark
|
||||
.createDataset(persons.rdd(), Encoders.bean(Persons.class));
|
||||
|
||||
personsDataset.show(false);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDatasetDump() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/dataset_extendedinstance")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpEntitiesJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<GraphResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, GraphResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<GraphResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(GraphResult.class));
|
||||
|
||||
Assertions.assertEquals(1, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(1, verificationDataset.filter("type = 'dataset'").count());
|
||||
|
||||
// the common fields in the result have been already checked. Now checking only
|
||||
// community specific fields
|
||||
|
||||
GraphResult gr = verificationDataset.first();
|
||||
|
||||
Assertions.assertEquals(2, gr.getGeolocation().size());
|
||||
Assertions.assertEquals(2, gr.getGeolocation().stream().filter(gl -> gl.getBox().equals("")).count());
|
||||
Assertions.assertEquals(1, gr.getGeolocation().stream().filter(gl -> gl.getPlace().equals("")).count());
|
||||
Assertions.assertEquals(1, gr.getGeolocation().stream().filter(gl -> gl.getPoint().equals("")).count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
gr
|
||||
.getGeolocation()
|
||||
.stream()
|
||||
.filter(gl -> gl.getPlace().equals("18 York St, Ottawa, ON K1N 5S6; Ottawa; Ontario; Canada"))
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, gr.getGeolocation().stream().filter(gl -> gl.getPoint().equals("45.427242 -75.693904")).count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
gr
|
||||
.getGeolocation()
|
||||
.stream()
|
||||
.filter(gl -> gl.getPoint().equals("") && !gl.getPlace().equals(""))
|
||||
.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1,
|
||||
gr
|
||||
.getGeolocation()
|
||||
.stream()
|
||||
.filter(gl -> !gl.getPoint().equals("") && gl.getPlace().equals(""))
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals("1024Gb", gr.getSize());
|
||||
|
||||
Assertions.assertEquals("1.01", gr.getVersion());
|
||||
|
||||
Assertions.assertEquals(null, gr.getContainer());
|
||||
Assertions.assertEquals(null, gr.getCodeRepositoryUrl());
|
||||
Assertions.assertEquals(null, gr.getProgrammingLanguage());
|
||||
Assertions.assertEquals(null, gr.getDocumentationUrl());
|
||||
Assertions.assertEquals(null, gr.getContactperson());
|
||||
Assertions.assertEquals(null, gr.getContactgroup());
|
||||
Assertions.assertEquals(null, gr.getTool());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSoftwareDump() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/software_extendedinstance")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpEntitiesJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<GraphResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, GraphResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<GraphResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(GraphResult.class));
|
||||
|
||||
Assertions.assertEquals(1, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(1, verificationDataset.filter("type = 'software'").count());
|
||||
|
||||
GraphResult gr = verificationDataset.first();
|
||||
|
||||
Assertions.assertEquals(2, gr.getDocumentationUrl().size());
|
||||
Assertions.assertTrue(gr.getDocumentationUrl().contains("doc_url_1"));
|
||||
Assertions.assertTrue(gr.getDocumentationUrl().contains("doc_url_2"));
|
||||
|
||||
Assertions.assertEquals("code_repo", gr.getCodeRepositoryUrl());
|
||||
|
||||
Assertions.assertEquals("perl", gr.getProgrammingLanguage());
|
||||
|
||||
Assertions.assertEquals(null, gr.getContainer());
|
||||
Assertions.assertEquals(null, gr.getContactperson());
|
||||
Assertions.assertEquals(null, gr.getContactgroup());
|
||||
Assertions.assertEquals(null, gr.getTool());
|
||||
Assertions.assertEquals(null, gr.getGeolocation());
|
||||
Assertions.assertEquals(null, gr.getSize());
|
||||
Assertions.assertEquals(null, gr.getVersion());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOrpDump() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/orp_extendedinstance")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpEntitiesJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<GraphResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, GraphResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<GraphResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(GraphResult.class));
|
||||
|
||||
Assertions.assertEquals(1, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(1, verificationDataset.filter("type = 'other'").count());
|
||||
|
||||
GraphResult gr = verificationDataset.first();
|
||||
|
||||
Assertions.assertEquals(2, gr.getContactperson().size());
|
||||
Assertions.assertTrue(gr.getContactperson().contains(("contact_person1")));
|
||||
Assertions.assertTrue(gr.getContactperson().contains(("contact_person2")));
|
||||
|
||||
Assertions.assertEquals(1, gr.getContactgroup().size());
|
||||
Assertions.assertTrue(gr.getContactgroup().contains(("contact_group")));
|
||||
|
||||
Assertions.assertEquals(2, gr.getTool().size());
|
||||
Assertions.assertTrue(gr.getTool().contains("tool1"));
|
||||
Assertions.assertTrue(gr.getTool().contains("tool2"));
|
||||
|
||||
Assertions.assertEquals(null, gr.getContainer());
|
||||
Assertions.assertEquals(null, gr.getDocumentationUrl());
|
||||
Assertions.assertEquals(null, gr.getCodeRepositoryUrl());
|
||||
Assertions.assertEquals(null, gr.getProgrammingLanguage());
|
||||
Assertions.assertEquals(null, gr.getGeolocation());
|
||||
Assertions.assertEquals(null, gr.getSize());
|
||||
Assertions.assertEquals(null, gr.getVersion());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPublicationDumpCommunity() throws Exception {
|
||||
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/publication_extendedinstance")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpCommunityProducts
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<CommunityResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<CommunityResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(CommunityResult.class));
|
||||
|
||||
Assertions.assertEquals(1, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(1, verificationDataset.filter("type = 'publication'").count());
|
||||
|
||||
// the common fields in the result have been already checked. Now checking only
|
||||
// community specific fields
|
||||
|
||||
CommunityResult cr = verificationDataset.first();
|
||||
|
||||
Assertions.assertEquals(1, cr.getContext().size());
|
||||
Assertions.assertEquals("dh-ch", cr.getContext().get(0).getCode());
|
||||
Assertions.assertEquals("Digital Humanities and Cultural Heritage", cr.getContext().get(0).getLabel());
|
||||
Assertions.assertEquals(1, cr.getContext().get(0).getProvenance().size());
|
||||
Assertions.assertEquals("Inferred by OpenAIRE", cr.getContext().get(0).getProvenance().get(0).getProvenance());
|
||||
Assertions.assertEquals("0.9", cr.getContext().get(0).getProvenance().get(0).getTrust());
|
||||
|
||||
Assertions.assertEquals(1, cr.getCollectedfrom().size());
|
||||
Assertions
|
||||
.assertEquals("openaire____::fdc7e0400d8c1634cdaf8051dbae23db", cr.getCollectedfrom().get(0).getKey());
|
||||
Assertions.assertEquals("Pensoft", cr.getCollectedfrom().get(0).getValue());
|
||||
|
||||
Assertions.assertEquals(1, cr.getInstance().size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"openaire____::fdc7e0400d8c1634cdaf8051dbae23db",
|
||||
cr.getInstance().get(0).getCollectedfrom().getKey());
|
||||
Assertions.assertEquals("Pensoft", cr.getInstance().get(0).getCollectedfrom().getValue());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"openaire____::e707e544b9a5bd23fc27fbfa65eb60dd", cr.getInstance().get(0).getHostedby().getKey());
|
||||
Assertions.assertEquals("One Ecosystem", cr.getInstance().get(0).getHostedby().getValue());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDataset() throws Exception {
|
||||
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/dataset.json")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpCommunityProducts
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<CommunityResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<CommunityResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(CommunityResult.class));
|
||||
|
||||
Assertions.assertEquals(90, verificationDataset.count());
|
||||
|
||||
Assertions
|
||||
.assertTrue(
|
||||
verificationDataset.filter("bestAccessright.code = 'c_abf2'").count() == verificationDataset
|
||||
.filter("bestAccessright.code = 'c_abf2' and bestAccessright.label = 'OPEN'")
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertTrue(
|
||||
verificationDataset.filter("bestAccessright.code = 'c_16ec'").count() == verificationDataset
|
||||
.filter("bestAccessright.code = 'c_16ec' and bestAccessright.label = 'RESTRICTED'")
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertTrue(
|
||||
verificationDataset.filter("bestAccessright.code = 'c_14cb'").count() == verificationDataset
|
||||
.filter("bestAccessright.code = 'c_14cb' and bestAccessright.label = 'CLOSED'")
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertTrue(
|
||||
verificationDataset.filter("bestAccessright.code = 'c_f1cf'").count() == verificationDataset
|
||||
.filter("bestAccessright.code = 'c_f1cf' and bestAccessright.label = 'EMBARGO'")
|
||||
.count());
|
||||
|
||||
Assertions.assertTrue(verificationDataset.filter("size(context) > 0").count() == 90);
|
||||
|
||||
Assertions.assertTrue(verificationDataset.filter("type = 'dataset'").count() == 90);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDataset2All() throws Exception {
|
||||
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/dataset_cleaned")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpEntitiesJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<GraphResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, GraphResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<GraphResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(GraphResult.class));
|
||||
|
||||
Assertions.assertEquals(5, verificationDataset.count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDataset2Communities() throws Exception {
|
||||
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/dataset_cleaned")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpCommunityProducts
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<CommunityResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<CommunityResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(CommunityResult.class));
|
||||
|
||||
Assertions.assertEquals(0, verificationDataset.count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPublication() throws Exception {
|
||||
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/publication.json")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpCommunityProducts
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<CommunityResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<CommunityResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(CommunityResult.class));
|
||||
|
||||
Assertions.assertEquals(74, verificationDataset.count());
|
||||
verificationDataset.show(false);
|
||||
|
||||
Assertions.assertEquals(74, verificationDataset.filter("type = 'publication'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSoftware() throws Exception {
|
||||
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/software.json")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpCommunityProducts
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<CommunityResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<CommunityResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(CommunityResult.class));
|
||||
|
||||
Assertions.assertEquals(6, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(6, verificationDataset.filter("type = 'software'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testORP() throws Exception {
|
||||
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/orp.json")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpCommunityProducts
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<CommunityResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<CommunityResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(CommunityResult.class));
|
||||
|
||||
Assertions.assertEquals(3, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(3, verificationDataset.filter("type = 'other'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRecord() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/singelRecord_pub.json")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpCommunityProducts
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<CommunityResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<CommunityResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(CommunityResult.class));
|
||||
|
||||
Assertions.assertEquals(2, verificationDataset.count());
|
||||
verificationDataset.show(false);
|
||||
|
||||
Assertions.assertEquals(2, verificationDataset.filter("type = 'publication'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArticlePCA() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/publication_pca")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpEntitiesJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<GraphResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, GraphResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<GraphResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(GraphResult.class));
|
||||
|
||||
Assertions.assertEquals(23, verificationDataset.count());
|
||||
|
||||
Assertions.assertEquals(23, verificationDataset.filter("type = 'publication'").count());
|
||||
|
||||
verificationDataset.createOrReplaceTempView("check");
|
||||
|
||||
org.apache.spark.sql.Dataset<Row> temp = spark
|
||||
.sql(
|
||||
"select id " +
|
||||
"from check " +
|
||||
"lateral view explode (instance) i as inst " +
|
||||
"where inst.articleprocessingcharge is not null");
|
||||
|
||||
Assertions.assertTrue(temp.count() == 2);
|
||||
|
||||
Assertions.assertTrue(temp.filter("id = 'datacite____::05c611fdfc93d7a2a703d1324e28104a'").count() == 1);
|
||||
|
||||
Assertions.assertTrue(temp.filter("id = 'dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8'").count() == 1);
|
||||
|
||||
temp = spark
|
||||
.sql(
|
||||
"select id, inst.articleprocessingcharge.amount, inst.articleprocessingcharge.currency " +
|
||||
"from check " +
|
||||
"lateral view explode (instance) i as inst " +
|
||||
"where inst.articleprocessingcharge is not null");
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"3131.64",
|
||||
temp
|
||||
.filter("id = 'datacite____::05c611fdfc93d7a2a703d1324e28104a'")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(1));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"EUR",
|
||||
temp
|
||||
.filter("id = 'datacite____::05c611fdfc93d7a2a703d1324e28104a'")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(2));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"2578.35",
|
||||
temp
|
||||
.filter("id = 'dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8'")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(1));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"EUR",
|
||||
temp
|
||||
.filter("id = 'dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8'")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testresultNotDumped() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/resultNotDumped.json")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
SparkDumpEntitiesJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath", workingDir.toString() + "/result",
|
||||
"-communityMapPath", communityMapPath
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<GraphResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, GraphResult.class));
|
||||
|
||||
Assertions.assertEquals(0, tmp.count());
|
||||
|
||||
}
|
||||
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue