forked from antonis.lempesis/dnet-hadoop
added generation of Scholix collection
This commit is contained in:
parent
2ef3705b2c
commit
7936583a3d
|
@ -39,19 +39,14 @@ public class SparkGenerateScholix {
|
||||||
|
|
||||||
final JavaRDD<String> relationToExport = sc.textFile(graphPath + "/relation").filter(ProvisionUtil::isNotDeleted);
|
final JavaRDD<String> relationToExport = sc.textFile(graphPath + "/relation").filter(ProvisionUtil::isNotDeleted);
|
||||||
final JavaPairRDD<String,String> scholixSummary = sc.textFile(workingDirPath + "/summary").mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i));
|
final JavaPairRDD<String,String> scholixSummary = sc.textFile(workingDirPath + "/summary").mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i));
|
||||||
|
|
||||||
|
|
||||||
PairFunction<Tuple2<String, String>, String, Scholix> k =
|
|
||||||
summaryRelation ->
|
|
||||||
new Tuple2<>(
|
|
||||||
DHPUtils.getJPathString(targetIDPath,summaryRelation._2()),
|
|
||||||
Scholix.generateScholixWithSource(summaryRelation._1(), summaryRelation._2()));
|
|
||||||
|
|
||||||
scholixSummary.join(
|
scholixSummary.join(
|
||||||
relationToExport
|
relationToExport
|
||||||
.mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(sourceIDPath, i), i)))
|
.mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(sourceIDPath, i), i)))
|
||||||
.map(Tuple2::_2)
|
.map(Tuple2::_2)
|
||||||
.mapToPair(k)
|
.mapToPair(summaryRelation ->
|
||||||
|
new Tuple2<>(
|
||||||
|
DHPUtils.getJPathString(targetIDPath,summaryRelation._2()),
|
||||||
|
Scholix.generateScholixWithSource(summaryRelation._1(), summaryRelation._2())))
|
||||||
.join(scholixSummary)
|
.join(scholixSummary)
|
||||||
.map(Tuple2::_2)
|
.map(Tuple2::_2)
|
||||||
.map(i -> i._1().addTarget(i._2()))
|
.map(i -> i._1().addTarget(i._2()))
|
||||||
|
|
|
@ -3,10 +3,8 @@ package eu.dnetlib.dhp.provision.scholix;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -32,25 +30,39 @@ public class Scholix implements Serializable {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class);
|
ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class);
|
||||||
Relation rel = mapper.readValue(sourceSummaryJson, Relation.class);
|
Relation rel = mapper.readValue(relation, Relation.class);
|
||||||
final Scholix s = new Scholix();
|
final Scholix s = new Scholix();
|
||||||
if (scholixSummary.getDate() != null)
|
if (scholixSummary.getDate() != null)
|
||||||
s.setPublicationDate(scholixSummary.getDate().stream().findFirst().orElse(null));
|
s.setPublicationDate(scholixSummary.getDate().stream().findFirst().orElse(null));
|
||||||
|
|
||||||
|
|
||||||
s.setLinkprovider(rel.getCollectedFrom().stream().map(cf ->
|
s.setLinkprovider(rel.getCollectedFrom().stream().map(cf ->
|
||||||
new ScholixEntityId(cf.getValue(), Collections.singletonList(
|
new ScholixEntityId(cf.getValue(), Collections.singletonList(
|
||||||
new ScholixIdentifier(cf.getKey(), "dnet_identifier")
|
new ScholixIdentifier(cf.getKey(), "dnet_identifier")
|
||||||
))).collect(Collectors.toList()));
|
))).collect(Collectors.toList()));
|
||||||
|
s.setRelationship(new ScholixRelationship(rel.getRelType(),rel.getRelClass(),null ));
|
||||||
|
s.setSource(ScholixResource.fromSummary(scholixSummary));
|
||||||
|
return s;
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void generateIdentifier( ) {
|
||||||
|
setIdentifier(DHPUtils.md5(String.format("%s::%s::%s",source.getDnetIdentifier(),relationship.getName(), target.getDnetIdentifier())));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public Scholix addTarget(final String targetSummaryJson) {
|
public Scholix addTarget(final String targetSummaryJson) {
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
try {
|
||||||
|
ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class);
|
||||||
|
setTarget(ScholixResource.fromSummary(targetSummary));
|
||||||
|
generateIdentifier();
|
||||||
return this;
|
return this;
|
||||||
|
} catch (Throwable e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,26 +1,70 @@
|
||||||
package eu.dnetlib.dhp.provision.scholix;
|
package eu.dnetlib.dhp.provision.scholix;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
public class ScholixResource implements Serializable {
|
public class ScholixResource implements Serializable {
|
||||||
|
|
||||||
private ScholixIdentifier identifier ;
|
private List<ScholixIdentifier> identifier;
|
||||||
private String dnetIdentifier ;
|
private String dnetIdentifier;
|
||||||
private String objectType ;
|
private String objectType;
|
||||||
private String objectSubType ;
|
private String objectSubType;
|
||||||
private String title ;
|
private String title;
|
||||||
private List<ScholixEntityId> creator ;
|
private List<ScholixEntityId> creator;
|
||||||
private String publicationDate ;
|
private String publicationDate;
|
||||||
private List<ScholixEntityId> publisher ;
|
private List<ScholixEntityId> publisher;
|
||||||
private List<ScholixCollectedFrom> collectedFrom ;
|
private List<ScholixCollectedFrom> collectedFrom;
|
||||||
|
|
||||||
|
|
||||||
public ScholixIdentifier getIdentifier() {
|
public static ScholixResource fromSummary(ScholixSummary summary) {
|
||||||
|
|
||||||
|
final ScholixResource resource = new ScholixResource();
|
||||||
|
|
||||||
|
resource.setDnetIdentifier(summary.getId());
|
||||||
|
|
||||||
|
resource.setIdentifier(summary.getLocalIdentifier().stream()
|
||||||
|
.map(i ->
|
||||||
|
new ScholixIdentifier(i.getId(), i.getType()))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
|
resource.setObjectType(summary.getTypology().toString());
|
||||||
|
|
||||||
|
resource.setTitle(summary.getTitle().stream().findAny().orElse(null));
|
||||||
|
|
||||||
|
if (summary.getAuthor() != null)
|
||||||
|
resource.setCreator(summary.getAuthor().stream()
|
||||||
|
.map(c -> new ScholixEntityId(c, null))
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
);
|
||||||
|
|
||||||
|
if (summary.getDate() != null)
|
||||||
|
resource.setPublicationDate(summary.getDate().stream().findAny().orElse(null));
|
||||||
|
if (summary.getPublisher() != null)
|
||||||
|
resource.setPublisher(summary.getPublisher().stream()
|
||||||
|
.map(p -> new ScholixEntityId(p, null))
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
);
|
||||||
|
if (summary.getDatasources() != null)
|
||||||
|
resource.setCollectedFrom(summary.getDatasources().stream()
|
||||||
|
.map(d ->
|
||||||
|
new ScholixCollectedFrom(new ScholixEntityId(d.getDatasourceName(),
|
||||||
|
Collections.singletonList(new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))
|
||||||
|
), "collected", d.getCompletionStatus()))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
return resource;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<ScholixIdentifier> getIdentifier() {
|
||||||
return identifier;
|
return identifier;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScholixResource setIdentifier(ScholixIdentifier identifier) {
|
public ScholixResource setIdentifier(List<ScholixIdentifier> identifier) {
|
||||||
this.identifier = identifier;
|
this.identifier = identifier;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,7 +27,7 @@
|
||||||
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="indexSummary"/>
|
<start to="generateScholix"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
@ -91,10 +91,29 @@
|
||||||
<arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
|
<arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
|
||||||
<arg>--index</arg><arg>${index}_object</arg>
|
<arg>--index</arg><arg>${index}_object</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="generateScholix"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Runs SparkGenerateScholix on YARN, passing the working directory and the graph path. -->
<action name="generateScholix">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <!-- Application name for this job; it was copy-pasted as "generate Summary". -->
        <name>generate Scholix</name>
        <class>eu.dnetlib.dhp.provision.SparkGenerateScholix</class>
        <jar>dhp-graph-provision-${projectVersion}.jar</jar>
        <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
        <arg>-mt</arg> <arg>yarn-cluster</arg>
        <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
        <arg>--graphPath</arg><arg>${graphPath}</arg>
    </spark>
    <!-- NOTE(review): this used to transition to "indexSummary". The index action defined
         just before this one (presumably indexSummary; confirm) already transitions to
         generateScholix, so going back to it would create a cycle, and Oozie workflows
         must be acyclic. Ending the workflow here instead. -->
    <ok to="End"/>
    <error to="Kill"/>
</action>
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
Loading…
Reference in New Issue