forked from antonis.lempesis/dnet-hadoop
added generation of Scholix collection
This commit is contained in:
parent
2ef3705b2c
commit
7936583a3d
|
@ -39,19 +39,14 @@ public class SparkGenerateScholix {
|
|||
|
||||
final JavaRDD<String> relationToExport = sc.textFile(graphPath + "/relation").filter(ProvisionUtil::isNotDeleted);
|
||||
final JavaPairRDD<String,String> scholixSummary = sc.textFile(workingDirPath + "/summary").mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i));
|
||||
|
||||
|
||||
PairFunction<Tuple2<String, String>, String, Scholix> k =
|
||||
summaryRelation ->
|
||||
new Tuple2<>(
|
||||
DHPUtils.getJPathString(targetIDPath,summaryRelation._2()),
|
||||
Scholix.generateScholixWithSource(summaryRelation._1(), summaryRelation._2()));
|
||||
|
||||
scholixSummary.join(
|
||||
relationToExport
|
||||
.mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(sourceIDPath, i), i)))
|
||||
.map(Tuple2::_2)
|
||||
.mapToPair(k)
|
||||
.mapToPair(summaryRelation ->
|
||||
new Tuple2<>(
|
||||
DHPUtils.getJPathString(targetIDPath,summaryRelation._2()),
|
||||
Scholix.generateScholixWithSource(summaryRelation._1(), summaryRelation._2())))
|
||||
.join(scholixSummary)
|
||||
.map(Tuple2::_2)
|
||||
.map(i -> i._1().addTarget(i._2()))
|
||||
|
|
|
@ -3,10 +3,8 @@ package eu.dnetlib.dhp.provision.scholix;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -32,25 +30,39 @@ public class Scholix implements Serializable {
|
|||
|
||||
try {
|
||||
ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class);
|
||||
Relation rel = mapper.readValue(sourceSummaryJson, Relation.class);
|
||||
Relation rel = mapper.readValue(relation, Relation.class);
|
||||
final Scholix s = new Scholix();
|
||||
if (scholixSummary.getDate() != null)
|
||||
s.setPublicationDate(scholixSummary.getDate().stream().findFirst().orElse(null));
|
||||
|
||||
|
||||
s.setLinkprovider(rel.getCollectedFrom().stream().map(cf ->
|
||||
new ScholixEntityId(cf.getValue(), Collections.singletonList(
|
||||
new ScholixIdentifier(cf.getKey(), "dnet_identifier")
|
||||
))).collect(Collectors.toList()));
|
||||
|
||||
|
||||
s.setRelationship(new ScholixRelationship(rel.getRelType(),rel.getRelClass(),null ));
|
||||
s.setSource(ScholixResource.fromSummary(scholixSummary));
|
||||
return s;
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void generateIdentifier( ) {
|
||||
setIdentifier(DHPUtils.md5(String.format("%s::%s::%s",source.getDnetIdentifier(),relationship.getName(), target.getDnetIdentifier())));
|
||||
|
||||
}
|
||||
|
||||
public Scholix addTarget(final String targetSummaryJson) {
|
||||
return this;
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
try {
|
||||
ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class);
|
||||
setTarget(ScholixResource.fromSummary(targetSummary));
|
||||
generateIdentifier();
|
||||
return this;
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,26 +1,70 @@
|
|||
package eu.dnetlib.dhp.provision.scholix;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class ScholixResource implements Serializable {
|
||||
|
||||
private ScholixIdentifier identifier ;
|
||||
private String dnetIdentifier ;
|
||||
private String objectType ;
|
||||
private String objectSubType ;
|
||||
private String title ;
|
||||
private List<ScholixEntityId> creator ;
|
||||
private String publicationDate ;
|
||||
private List<ScholixEntityId> publisher ;
|
||||
private List<ScholixCollectedFrom> collectedFrom ;
|
||||
private List<ScholixIdentifier> identifier;
|
||||
private String dnetIdentifier;
|
||||
private String objectType;
|
||||
private String objectSubType;
|
||||
private String title;
|
||||
private List<ScholixEntityId> creator;
|
||||
private String publicationDate;
|
||||
private List<ScholixEntityId> publisher;
|
||||
private List<ScholixCollectedFrom> collectedFrom;
|
||||
|
||||
|
||||
public ScholixIdentifier getIdentifier() {
|
||||
public static ScholixResource fromSummary(ScholixSummary summary) {
|
||||
|
||||
final ScholixResource resource = new ScholixResource();
|
||||
|
||||
resource.setDnetIdentifier(summary.getId());
|
||||
|
||||
resource.setIdentifier(summary.getLocalIdentifier().stream()
|
||||
.map(i ->
|
||||
new ScholixIdentifier(i.getId(), i.getType()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
resource.setObjectType(summary.getTypology().toString());
|
||||
|
||||
resource.setTitle(summary.getTitle().stream().findAny().orElse(null));
|
||||
|
||||
if (summary.getAuthor() != null)
|
||||
resource.setCreator(summary.getAuthor().stream()
|
||||
.map(c -> new ScholixEntityId(c, null))
|
||||
.collect(Collectors.toList())
|
||||
);
|
||||
|
||||
if (summary.getDate() != null)
|
||||
resource.setPublicationDate(summary.getDate().stream().findAny().orElse(null));
|
||||
if (summary.getPublisher() != null)
|
||||
resource.setPublisher(summary.getPublisher().stream()
|
||||
.map(p -> new ScholixEntityId(p, null))
|
||||
.collect(Collectors.toList())
|
||||
);
|
||||
if (summary.getDatasources() != null)
|
||||
resource.setCollectedFrom(summary.getDatasources().stream()
|
||||
.map(d ->
|
||||
new ScholixCollectedFrom(new ScholixEntityId(d.getDatasourceName(),
|
||||
Collections.singletonList(new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))
|
||||
), "collected", d.getCompletionStatus()))
|
||||
.collect(Collectors.toList()));
|
||||
return resource;
|
||||
|
||||
}
|
||||
|
||||
public List<ScholixIdentifier> getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public ScholixResource setIdentifier(ScholixIdentifier identifier) {
|
||||
public ScholixResource setIdentifier(List<ScholixIdentifier> identifier) {
|
||||
this.identifier = identifier;
|
||||
return this;
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
|
||||
</parameters>
|
||||
|
||||
<start to="indexSummary"/>
|
||||
<start to="generateScholix"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
|
@ -91,10 +91,29 @@
|
|||
<arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
|
||||
<arg>--index</arg><arg>${index}_object</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<ok to="generateScholix"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<!--
    Runs eu.dnetlib.dhp.provision.SparkGenerateScholix as a Spark job on YARN,
    passing the working directory and the graph path as arguments.
-->
<action name="generateScholix">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <!-- Job name fixed: it previously read "generate Summary" although this action runs SparkGenerateScholix. -->
        <name>generate Scholix</name>
        <class>eu.dnetlib.dhp.provision.SparkGenerateScholix</class>
        <jar>dhp-graph-provision-${projectVersion}.jar</jar>
        <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
        <arg>-mt</arg> <arg>yarn-cluster</arg>
        <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
        <arg>--graphPath</arg><arg>${graphPath}</arg>
    </spark>
    <!--
        NOTE(review): another action in this workflow transitions to "generateScholix" on success.
        If that action is "indexSummary", this transition closes a cycle, which Oozie does not
        allow in a workflow definition - confirm the intended order.
    -->
    <ok to="indexSummary"/>
    <error to="Kill"/>
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
|
||||
</workflow-app>
|
Loading…
Reference in New Issue