This commit is contained in:
parent 81b55dc20b
commit 6b113961c1
@@ -155,7 +155,7 @@ public class ResultMapper implements Serializable {
 				input
 					.getCollectedfrom()
 					.stream()
-					.map(cf -> CfHbKeyValue.newInstance(cf.getKey(), cf.getValue()))
+					.map(cf -> CfHbKeyValue.newInstance(cf.getKey().substring(3), cf.getValue()))
 					.collect(Collectors.toList()));
 		}
 
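Note: the recurring .substring(3) introduced by this commit strips the three-character entity-type prefix ("10|", "20|", "40|", "50|") that OpenAIRE graph identifiers carry, so the dumped records expose bare namespace-qualified ids. A minimal sketch of the effect; the sample identifier is invented:

    public class PrefixStripExample {
        public static void main(String[] args) {
            String key = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"; // invented datasource id
            // substring(3) drops the leading "10|" entity-type prefix
            System.out.println(key.substring(3)); // openaire____::081b82f96300b6a6e3d282bad31cb6e2
        }
    }
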
@@ -518,11 +518,11 @@ public class ResultMapper implements Serializable {
 			instance
 				.setCollectedfrom(
 					CfHbKeyValue
-						.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
+						.newInstance(i.getCollectedfrom().getKey().substring(3), i.getCollectedfrom().getValue()));
 
 			instance
 				.setHostedby(
-					CfHbKeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
+					CfHbKeyValue.newInstance(i.getHostedby().getKey().substring(3), i.getHostedby().getValue()));
 
 			return instance;
 
@@ -63,7 +63,7 @@ public class Utils {
 
 		return String
 			.format(
-				"%s|%s::%s", Constants.CONTEXT_ID, Constants.CONTEXT_NS_PREFIX,
+				"%s::%s", Constants.CONTEXT_ID, Constants.CONTEXT_NS_PREFIX,
 				DHPUtils.md5(id));
 	}
 
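Note: the new format string has two placeholders, but the argument list still passes three values (Constants.CONTEXT_ID, Constants.CONTEXT_NS_PREFIX, and the md5). Java's Formatter silently ignores surplus arguments, so, if the arguments are really left unchanged as the diff shows, getContextId now returns CONTEXT_ID::CONTEXT_NS_PREFIX and the md5 hash is dropped. A self-contained check of that behavior, with invented values:

    public class FormatSurplusArgs {
        public static void main(String[] args) {
            // Extra trailing arguments to String.format are silently ignored
            System.out.println(String.format("%s::%s", "ctx", "prefix", "md5hash"));
            // prints: ctx::prefix  ("md5hash" never appears)
        }
    }
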
@@ -61,6 +61,13 @@ public class SparkPrepareResultProject implements Serializable {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 
+		Boolean substring = Optional
+			.ofNullable(parser.get("substring"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
 		final String inputPath = parser.get("sourcePath");
 		log.info("inputPath: {}", inputPath);
 
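Note: the new substring flag reuses the Optional idiom applied to isSparkSessionManaged just above, so a missing or null argument defaults to Boolean.TRUE. (The second log.info still prints the isSparkSessionManaged message; presumably it was meant to log the new flag.) A stand-alone sketch of the parsing idiom:

    import java.util.Optional;

    public class OptionalFlagSketch {
        static Boolean parseFlag(String raw) {
            return Optional
                .ofNullable(raw)
                .map(Boolean::valueOf)
                .orElse(Boolean.TRUE);
        }

        public static void main(String[] args) {
            System.out.println(parseFlag(null));    // true  (absent -> default)
            System.out.println(parseFlag("false")); // false
            System.out.println(parseFlag("junk"));  // false (Boolean.valueOf is lenient)
        }
    }
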
@@ -74,11 +81,12 @@ public class SparkPrepareResultProject implements Serializable {
 			isSparkSessionManaged,
 			spark -> {
 				Utils.removeOutputDir(spark, outputPath);
-				prepareResultProjectList(spark, inputPath, outputPath);
+				prepareResultProjectList(spark, inputPath, outputPath, substring);
 			});
 	}
 
-	private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath) {
+	private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath,
+		Boolean substring) {
 		Dataset<Relation> relation = Utils
 			.readPath(spark, inputPath + "/relation", Relation.class)
 			.filter(
@@ -101,7 +109,10 @@ public class SparkPrepareResultProject implements Serializable {
 					Set<String> projectSet = new HashSet<>();
 					Tuple2<eu.dnetlib.dhp.schema.oaf.Project, Relation> first = it.next();
 					ResultProject rp = new ResultProject();
-					rp.setResultId(s);
+					if (substring)
+						rp.setResultId(s.substring(3));
+					else
+						rp.setResultId(s);
 					eu.dnetlib.dhp.schema.oaf.Project p = first._1();
 					projectSet.add(p.getId());
 					Project ps = getProject(p, first._2);
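Note: result ids in the OpenAIRE graph carry the "50|" prefix, so the two branches yield, for an invented id:

    public class ResultIdSketch {
        public static void main(String[] args) {
            boolean substring = true; // the new flag
            String s = "50|doi_________::0123456789abcdef0123456789abcdef"; // invented result id
            String resultId = substring ? s.substring(3) : s;
            System.out.println(resultId); // doi_________::0123456789abcdef0123456789abcdef
            // with substring == false the id is kept verbatim, prefix included
        }
    }
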
@@ -131,7 +142,7 @@ public class SparkPrepareResultProject implements Serializable {
 	private static Project getProject(eu.dnetlib.dhp.schema.oaf.Project op, Relation relation) {
 		Project p = Project
 			.newInstance(
-				op.getId(),
+				op.getId().substring(3),
 				op.getCode().getValue(),
 				Optional
 					.ofNullable(op.getAcronym())
@@ -84,7 +84,7 @@ public class Extractor implements Serializable {
 						.orElse(null))
 					.orElse(null);
 				Relation r = getRelation(
-					value.getId().substring(3), contextId.substring(3),
+					value.getId().substring(3), contextId,
 					Constants.RESULT_ENTITY,
 					Constants.CONTEXT_ENTITY,
 					ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP, provenance);
@@ -94,7 +94,7 @@ public class Extractor implements Serializable {
 					hashCodes.add(r.hashCode());
 				}
 				r = getRelation(
-					contextId, value.getId().substring(3),
+					contextId, value.getId().substring(3),
 					Constants.CONTEXT_ENTITY,
 					Constants.RESULT_ENTITY,
 					ModelConstants.IS_RELATED_TO,
@@ -55,8 +55,9 @@ public class Process implements Serializable {
 				ci
 					.getDatasourceList()
 					.forEach(ds -> {
 
 						String nodeType = ModelSupport.idPrefixEntity.get(ds.substring(0, 2));
+						String datasourceId = ds.startsWith("10|") || ds.startsWith("40|") ? ds.substring(3) : ds;
 
 						String contextId = Utils.getContextId(ci.getId());
 						relationList
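Note: here the prefix is stripped only for ids that actually start with "10|" (datasource) or "40|" (project); anything else passes through untouched. In isolation, with invented inputs:

    public class DatasourceIdSketch {
        // mirrors the guard above: strip only the recognized 10|/40| prefixes
        static String stripKnownPrefix(String ds) {
            return ds.startsWith("10|") || ds.startsWith("40|") ? ds.substring(3) : ds;
        }

        public static void main(String[] args) {
            System.out.println(stripKnownPrefix("10|doajarticles::abcd")); // doajarticles::abcd (invented id)
            System.out.println(stripKnownPrefix("foo::bar"));              // foo::bar, unchanged
        }
    }
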
@@ -65,7 +65,7 @@ public class Process implements Serializable {
 							.newInstance(
 
 								contextId, eu.dnetlib.dhp.oa.model.graph.Constants.CONTEXT_ENTITY,
-								ds, nodeType,
+								datasourceId, nodeType,
 								RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
 								Provenance
 									.newInstance(
@@ -76,7 +76,7 @@ public class Process implements Serializable {
 						.add(
 							Relation
 								.newInstance(
-									ds, nodeType,
+									datasourceId, nodeType,
 									contextId, eu.dnetlib.dhp.oa.model.graph.Constants.CONTEXT_ENTITY,
 									RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
 									Provenance
@@ -105,7 +105,7 @@ public class SparkCollectAndSave implements Serializable {
 
 		}
 
-		// Dataset<String> dumpedIds = Utils.getEntitiesId(spark, outputPath);
+		Dataset<String> dumpedIds = Utils.getEntitiesId(spark, outputPath);
 
 		Dataset<Relation> relations = Utils
 			.readPath(spark, inputPath + "/relation/publication", Relation.class)
@@ -116,16 +116,18 @@ public class SparkCollectAndSave implements Serializable {
 			.union(Utils.readPath(spark, inputPath + "/relation/context", Relation.class))
 			.union(Utils.readPath(spark, inputPath + "/relation/relation", Relation.class));
 
-		Utils.getValidRelations(relations, Utils.getEntitiesId(spark, outputPath))
-//		Dataset<Relation> relJoinSource = relations
-//			.joinWith(dumpedIds, relations.col("source.id").equalTo(dumpedIds.col("value")))
-//			.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(),
-//				Encoders.bean(Relation.class));
-//
-//		relJoinSource
-//			.joinWith(dumpedIds, relJoinSource.col("target.id").equalTo(dumpedIds.col("value")))
-//			.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(),
-//				Encoders.bean(Relation.class))
+		// Utils.getValidRelations(relations, Utils.getEntitiesId(spark, outputPath))
+		Dataset<Relation> relJoinSource = relations
+			.joinWith(dumpedIds, relations.col("source").equalTo(dumpedIds.col("value")))
+			.map(
+				(MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(),
+				Encoders.bean(Relation.class));
+
+		relJoinSource
+			.joinWith(dumpedIds, relJoinSource.col("target").equalTo(dumpedIds.col("value")))
+			.map(
+				(MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(),
+				Encoders.bean(Relation.class))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
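Note: the Utils.getValidRelations helper is replaced by an inlined two-step semi-join: keep relations whose source appears among the dumped entity ids, then restrict to those whose target does as well (the join columns also change from the nested source.id/target.id to flat source/target fields). A runnable sketch of the same pattern on toy data; the Rel bean and sample values are invented, only the joinWith/map shape follows the code above:

    import java.util.Arrays;
    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;
    import scala.Tuple2;

    public class ValidRelationsSketch {
        public static class Rel implements java.io.Serializable {
            private String source;
            private String target;
            public Rel() {}
            public Rel(String s, String t) { source = s; target = t; }
            public String getSource() { return source; }
            public void setSource(String s) { source = s; }
            public String getTarget() { return target; }
            public void setTarget(String t) { target = t; }
        }

        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate();
            Dataset<String> dumpedIds = spark.createDataset(Arrays.asList("a", "b"), Encoders.STRING());
            Dataset<Rel> relations = spark.createDataset(
                Arrays.asList(new Rel("a", "b"), new Rel("a", "x")), Encoders.bean(Rel.class));

            // keep relations whose source was dumped ...
            Dataset<Rel> relJoinSource = relations
                .joinWith(dumpedIds, relations.col("source").equalTo(dumpedIds.col("value")))
                .map((MapFunction<Tuple2<Rel, String>, Rel>) t2 -> t2._1(), Encoders.bean(Rel.class));

            // ... and, of those, whose target was dumped too
            relJoinSource
                .joinWith(dumpedIds, relJoinSource.col("target").equalTo(dumpedIds.col("value")))
                .map((MapFunction<Tuple2<Rel, String>, Rel>) t2 -> t2._1(), Encoders.bean(Rel.class))
                .show(); // only the (a, b) relation survives

            spark.stop();
        }
    }
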
@@ -79,7 +79,9 @@ public class SparkDumpRelationJob implements Serializable {
 	private static void dumpRelation(SparkSession spark, String inputPath, String outputPath, Set<String> removeSet) {
 		Dataset<Relation> relations = Utils.readPath(spark, inputPath, Relation.class);
 		relations
-			.filter((FilterFunction<Relation>) r -> !removeSet.contains(r.getRelClass()))
+			.filter(
+				(FilterFunction<Relation>) r -> !removeSet.contains(r.getRelClass())
+					&& !r.getSubRelType().equalsIgnoreCase("resultService"))
 			.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
 				eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
 				relNew
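Note: besides the configurable removeSet, the dump now unconditionally drops relations whose subRelType is resultService. The combined predicate in isolation, with a toy Relation type and invented removeSet contents:

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    public class RelationFilterSketch {
        static class Rel {
            String relClass, subRelType;
            Rel(String relClass, String subRelType) { this.relClass = relClass; this.subRelType = subRelType; }
        }

        public static void main(String[] args) {
            Set<String> removeSet = new HashSet<>(Arrays.asList("merges")); // invented contents
            Rel r = new Rel("isRelatedTo", "resultService");
            boolean keep = !removeSet.contains(r.relClass)
                && !r.subRelType.equalsIgnoreCase("resultService");
            System.out.println(keep); // false: resultService relations are now excluded
        }
    }
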
@@ -155,7 +155,7 @@ public class SparkOrganizationRelation implements Serializable {
 					eu.dnetlib.dhp.oa.model.graph.Relation
 						.newInstance(
 							id, Constants.CONTEXT_ENTITY,
-							organization,
+							organization.substring(3),
 							ModelSupport.idPrefixEntity.get(organization.substring(0, 2)),
 							RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
 							Provenance
@@ -167,7 +167,7 @@ public class SparkOrganizationRelation implements Serializable {
 				.add(
 					eu.dnetlib.dhp.oa.model.graph.Relation
 						.newInstance(
-							organization, ModelSupport.idPrefixEntity.get(organization.substring(0, 2)),
+							organization.substring(3), ModelSupport.idPrefixEntity.get(organization.substring(0, 2)),
 							id, Constants.CONTEXT_ENTITY,
 							RelType.newInstance(ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
 							Provenance
@@ -16,5 +16,11 @@
 		"paramLongName": "isSparkSessionManaged",
 		"paramDescription": "true if the spark session is managed, false otherwise",
 		"paramRequired": false
+	},
+	{
+		"paramName": "sb",
+		"paramLongName": "substring",
+		"paramDescription": "true if the spark session is managed, false otherwise",
+		"paramRequired": false
 	}
 ]
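Note: this JSON entry registers --substring (short form -sb) as a legal argument for the prepare job; its paramDescription was evidently copy-pasted from isSparkSessionManaged and does not describe the flag. The workflow change below passes --substring false, so that particular dump keeps the full, prefixed result ids. A sketch of the job-side handshake, assuming the argument-parser type whose get() calls appear in the job code above; jsonConfiguration is a hypothetical String holding this parameter spec:

    // hypothetical wiring, following the pattern visible in the job code above
    ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
    parser.parseArgument(new String[] { "--sourcePath", "/tmp/in", "--substring", "false" });
    Boolean substring = Optional
        .ofNullable(parser.get("substring"))
        .map(Boolean::valueOf)
        .orElse(Boolean.TRUE); // flag omitted -> prefixes are stripped by default
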
@@ -102,6 +102,7 @@
 				</spark-opts>
 				<arg>--sourcePath</arg><arg>${sourcePath}</arg>
 				<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
+				<arg>--substring</arg><arg>false</arg>
 			</spark>
 			<ok to="fork_result_linked_to_projects"/>
 			<error to="Kill"/>