added generation of Scholix collection

This commit is contained in:
Sandro La Bruzzo 2020-02-26 12:09:06 +01:00
parent 2ef3705b2c
commit 7936583a3d
4 changed files with 103 additions and 33 deletions

View File

@ -39,19 +39,14 @@ public class SparkGenerateScholix {
final JavaRDD<String> relationToExport = sc.textFile(graphPath + "/relation").filter(ProvisionUtil::isNotDeleted);
final JavaPairRDD<String,String> scholixSummary = sc.textFile(workingDirPath + "/summary").mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i));
PairFunction<Tuple2<String, String>, String, Scholix> k =
summaryRelation ->
new Tuple2<>(
DHPUtils.getJPathString(targetIDPath,summaryRelation._2()),
Scholix.generateScholixWithSource(summaryRelation._1(), summaryRelation._2()));
scholixSummary.join(
relationToExport
.mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(sourceIDPath, i), i)))
.map(Tuple2::_2)
.mapToPair(k)
.mapToPair(summaryRelation ->
new Tuple2<>(
DHPUtils.getJPathString(targetIDPath,summaryRelation._2()),
Scholix.generateScholixWithSource(summaryRelation._1(), summaryRelation._2())))
.join(scholixSummary)
.map(Tuple2::_2)
.map(i -> i._1().addTarget(i._2()))

View File

@ -3,10 +3,8 @@ package eu.dnetlib.dhp.provision.scholix;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
@ -32,25 +30,39 @@ public class Scholix implements Serializable {
try {
ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class);
Relation rel = mapper.readValue(sourceSummaryJson, Relation.class);
Relation rel = mapper.readValue(relation, Relation.class);
final Scholix s = new Scholix();
if (scholixSummary.getDate() != null)
s.setPublicationDate(scholixSummary.getDate().stream().findFirst().orElse(null));
s.setLinkprovider(rel.getCollectedFrom().stream().map(cf ->
new ScholixEntityId(cf.getValue(), Collections.singletonList(
new ScholixIdentifier(cf.getKey(), "dnet_identifier")
))).collect(Collectors.toList()));
s.setRelationship(new ScholixRelationship(rel.getRelType(),rel.getRelClass(),null ));
s.setSource(ScholixResource.fromSummary(scholixSummary));
return s;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
/**
 * Derives this link's identifier from its two endpoints and its relationship.
 *
 * Builds the string {@code <source dnet id>::<relationship name>::<target dnet id>},
 * passes it to {@code DHPUtils.md5} and stores the result through {@code setIdentifier}.
 * The source, relationship and target members are dereferenced directly, so all
 * three must be set before this is called.
 */
private void generateIdentifier() {
    final String sourceId = source.getDnetIdentifier();
    final String relationName = relationship.getName();
    final String targetId = target.getDnetIdentifier();
    final String seed = String.format("%s::%s::%s", sourceId, relationName, targetId);
    setIdentifier(DHPUtils.md5(seed));
}
/**
 * Completes this link with its target resource.
 *
 * Parses {@code targetSummaryJson} as a {@code ScholixSummary}, converts it with
 * {@code ScholixResource.fromSummary}, stores it as the target and then
 * regenerates the link identifier (which reads the target).
 *
 * @param targetSummaryJson JSON serialization of the target's {@code ScholixSummary}
 * @return this instance, so calls can be chained
 * @throws RuntimeException wrapping any failure raised while parsing or converting the summary
 */
public Scholix addTarget(final String targetSummaryJson) {
    // The former stub body ("return this;" as the first statement) is gone: it
    // returned before the target was ever parsed, leaving the code below unreachable.
    final ObjectMapper mapper = new ObjectMapper();
    try {
        final ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class);
        setTarget(ScholixResource.fromSummary(targetSummary));
        // Must run after setTarget: the identifier is derived from the target's dnet id.
        generateIdentifier();
        return this;
    } catch (Throwable e) {
        // Throwable is kept to mirror generateScholixWithSource's error handling.
        throw new RuntimeException(e);
    }
}

View File

@ -1,26 +1,70 @@
package eu.dnetlib.dhp.provision.scholix;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
public class ScholixResource implements Serializable {
private ScholixIdentifier identifier ;
private String dnetIdentifier ;
private String objectType ;
private String objectSubType ;
private String title ;
private List<ScholixEntityId> creator ;
private String publicationDate ;
private List<ScholixEntityId> publisher ;
private List<ScholixCollectedFrom> collectedFrom ;
private List<ScholixIdentifier> identifier;
private String dnetIdentifier;
private String objectType;
private String objectSubType;
private String title;
private List<ScholixEntityId> creator;
private String publicationDate;
private List<ScholixEntityId> publisher;
private List<ScholixCollectedFrom> collectedFrom;
public ScholixIdentifier getIdentifier() {
/**
 * Builds a {@code ScholixResource} from a {@code ScholixSummary}.
 *
 * The summary id is always copied as the dnet identifier. Every other summary
 * field (local identifiers, typology, title, authors, dates, publishers,
 * datasources) is treated as optional: when the summary returns {@code null}
 * for it, the matching resource field is left unset instead of failing with a
 * {@code NullPointerException}.
 *
 * @param summary the summary to convert; must not be {@code null}
 * @return a new resource populated from {@code summary}
 */
public static ScholixResource fromSummary(ScholixSummary summary) {
    final ScholixResource resource = new ScholixResource();

    resource.setDnetIdentifier(summary.getId());

    if (summary.getLocalIdentifier() != null) {
        resource.setIdentifier(summary.getLocalIdentifier().stream()
                .map(i -> new ScholixIdentifier(i.getId(), i.getType()))
                .collect(Collectors.toList()));
    }

    if (summary.getTypology() != null) {
        resource.setObjectType(summary.getTypology().toString());
    }

    // findFirst (not findAny) so the chosen title/date is deterministic and the
    // date matches the one Scholix.generateScholixWithSource picks with findFirst.
    if (summary.getTitle() != null) {
        resource.setTitle(summary.getTitle().stream().findFirst().orElse(null));
    }

    if (summary.getAuthor() != null) {
        resource.setCreator(summary.getAuthor().stream()
                .map(c -> new ScholixEntityId(c, null))
                .collect(Collectors.toList()));
    }

    if (summary.getDate() != null) {
        resource.setPublicationDate(summary.getDate().stream().findFirst().orElse(null));
    }

    if (summary.getPublisher() != null) {
        resource.setPublisher(summary.getPublisher().stream()
                .map(p -> new ScholixEntityId(p, null))
                .collect(Collectors.toList()));
    }

    if (summary.getDatasources() != null) {
        // Each datasource becomes a ScholixCollectedFrom built from: an entity
        // (datasource name + its id under the "dnet_identifier" schema), the
        // literal "collected", and the datasource's completion status.
        resource.setCollectedFrom(summary.getDatasources().stream()
                .map(d -> new ScholixCollectedFrom(
                        new ScholixEntityId(
                                d.getDatasourceName(),
                                Collections.singletonList(
                                        new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))),
                        "collected",
                        d.getCompletionStatus()))
                .collect(Collectors.toList()));
    }

    return resource;
}
/**
 * Returns the identifiers attached to this resource.
 *
 * @return the stored identifier list, exactly as it was set (no copy is made);
 *         {@code null} if it was never set
 */
public List<ScholixIdentifier> getIdentifier() {
    return this.identifier;
}
public ScholixResource setIdentifier(ScholixIdentifier identifier) {
/**
 * Replaces the identifiers attached to this resource.
 *
 * @param identifier the identifier list to store; kept by reference, not copied
 * @return this resource, so setters can be chained
 */
public ScholixResource setIdentifier(List<ScholixIdentifier> identifier) {
    this.identifier = identifier;

    return this;
}

View File

@ -27,7 +27,7 @@
</parameters>
<start to="indexSummary"/>
<start to="generateScholix"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@ -91,10 +91,29 @@
<arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
<arg>--index</arg><arg>${index}_object</arg>
</spark>
<ok to="End"/>
<ok to="generateScholix"/>
<error to="Kill"/>
</action>
<!-- Runs SparkGenerateScholix, which reads relations from ${graphPath} and summaries from ${workingDirPath}. -->
<action name="generateScholix">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <!-- Job label shown by the cluster; it previously read "generate Summary", a leftover from the summary action. -->
        <name>generate Scholix</name>
        <class>eu.dnetlib.dhp.provision.SparkGenerateScholix</class>
        <jar>dhp-graph-provision-${projectVersion}.jar</jar>
        <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
        <arg>-mt</arg> <arg>yarn-cluster</arg>
        <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
        <arg>--graphPath</arg><arg>${graphPath}</arg>
    </spark>
    <!-- NOTE(review): this transitions to indexSummary while another action in this workflow
         transitions to generateScholix; confirm the two do not form a cycle, which Oozie rejects. -->
    <ok to="indexSummary"/>
    <error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>