This commit is contained in:
parent
0f602bae9d
commit
db388ebc21
|
@ -17,6 +17,7 @@ import org.apache.spark.sql.Dataset;
|
|||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -76,8 +77,10 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, workingPath + resultType + "extendedaffiliation");
|
||||
Utils.removeOutputDir(spark, workingPath + resultType + "organization");
|
||||
Utils.removeOutputDir(spark, workingPath + resultType + "resultOrganization");
|
||||
addOrganizations(spark, inputPath, workingPath, resultType);
|
||||
dumpOrganizationAndRelations(spark, inputPath, workingPath, resultType);
|
||||
dumpOrganizationAndRelations(spark, inputPath, workingPath, resultType);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -101,6 +104,8 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
|||
.joinWith(relation, result.col("id").equalTo(relation.col("source")))
|
||||
.map((MapFunction<Tuple2<Result, Relation>, Relation>) t2 -> t2._2(), Encoders.bean(Relation.class));
|
||||
|
||||
log.info("Number of affiliation relation for " + resultType + " = " + eoscRelation.count());
|
||||
|
||||
// from eoscRelation select the organization
|
||||
Dataset<String> organizationIds = eoscRelation
|
||||
.joinWith(organization, eoscRelation.col("target").equalTo(organization.col("id")))
|
||||
|
@ -121,7 +126,7 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
|||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + resultType + "/organization");
|
||||
.json(workingPath + resultType + "organization");
|
||||
|
||||
eoscRelation
|
||||
.joinWith(organization, eoscRelation.col("target").equalTo(organization.col("id")))
|
||||
|
@ -137,7 +142,7 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
|||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + resultType + "/resultOrganization");
|
||||
.json(workingPath + resultType + "resultOrganization");
|
||||
|
||||
}
|
||||
|
||||
|
@ -159,67 +164,18 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
|||
|
||||
Dataset<ResultOrganizations> resultOrganization = relations
|
||||
.joinWith(organizations, relations.col("source").equalTo(organizations.col("id")), "left")
|
||||
.map((MapFunction<Tuple2<Relation, Organization>, ResultOrganizations>) t2 -> {
|
||||
if (t2._2() != null) {
|
||||
ResultOrganizations rOrg = new ResultOrganizations();
|
||||
rOrg.setResultId(t2._1().getTarget());
|
||||
Affiliation org = new Affiliation();
|
||||
org.setId(t2._2().getId());
|
||||
if (Optional.ofNullable(t2._2().getLegalname()).isPresent()) {
|
||||
org.setName(t2._2().getLegalname().getValue());
|
||||
} else {
|
||||
org.setName("");
|
||||
}
|
||||
HashMap<String, Set<String>> organizationPids = new HashMap<>();
|
||||
if (Optional.ofNullable(t2._2().getPid()).isPresent())
|
||||
t2._2().getPid().forEach(p -> {
|
||||
if (!organizationPids.containsKey(p.getQualifier().getClassid()))
|
||||
organizationPids.put(p.getQualifier().getClassid(), new HashSet<>());
|
||||
organizationPids.get(p.getQualifier().getClassid()).add(p.getValue());
|
||||
});
|
||||
List<OrganizationPid> pids = new ArrayList<>();
|
||||
for (String key : organizationPids.keySet()) {
|
||||
for (String value : organizationPids.get(key)) {
|
||||
OrganizationPid pid = new OrganizationPid();
|
||||
pid.setValue(value);
|
||||
pid.setType(key);
|
||||
pids.add(pid);
|
||||
}
|
||||
}
|
||||
org.setPid(pids);
|
||||
rOrg.setAffiliation(org);
|
||||
return rOrg;
|
||||
}
|
||||
return null;
|
||||
|
||||
}, Encoders.bean(ResultOrganizations.class))
|
||||
.map((MapFunction<Tuple2<Relation, Organization>, ResultOrganizations>) t2 -> getResultOrganizations(t2), Encoders.bean(ResultOrganizations.class))
|
||||
.filter(Objects::nonNull);
|
||||
|
||||
System.out.println(resultOrganization.count());
|
||||
|
||||
results
|
||||
.joinWith(resultOrganization, results.col("id").equalTo(resultOrganization.col("resultId")), "left")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<Result, ResultOrganizations>, String>) t2 -> t2._1().getId(), Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, Tuple2<Result, ResultOrganizations>, Result>) (s, it) -> {
|
||||
Tuple2<Result, ResultOrganizations> first = it.next();
|
||||
if (first._2() == null) {
|
||||
return first._1();
|
||||
}
|
||||
Result ret = first._1();
|
||||
List<Affiliation> affiliation = new ArrayList<>();
|
||||
Set<String> alreadyInsertedAffiliations = new HashSet<>();
|
||||
affiliation.add(first._2().getAffiliation());
|
||||
alreadyInsertedAffiliations.add(first._2().getAffiliation().getId());
|
||||
it.forEachRemaining(res -> {
|
||||
if (!alreadyInsertedAffiliations.contains(res._2().getAffiliation().getId())) {
|
||||
affiliation.add(res._2().getAffiliation());
|
||||
alreadyInsertedAffiliations.add(res._2().getAffiliation().getId());
|
||||
}
|
||||
|
||||
});
|
||||
ret.setAffiliation(affiliation);
|
||||
return ret;
|
||||
}, Encoders.bean(Result.class))
|
||||
(MapGroupsFunction<String, Tuple2<Result, ResultOrganizations>, Result>) (s, it) -> addAffiliation(it)
|
||||
, Encoders.bean(Result.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
|
@ -227,6 +183,63 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private static ResultOrganizations getResultOrganizations(Tuple2<Relation, Organization> t2) {
|
||||
if (t2._2() != null) {
|
||||
Organization organization = t2._2();
|
||||
ResultOrganizations rOrg = new ResultOrganizations();
|
||||
rOrg.setResultId(t2._1().getTarget());
|
||||
Affiliation org = new Affiliation();
|
||||
org.setId(organization.getId());
|
||||
if (Optional.ofNullable(organization.getLegalname()).isPresent()) {
|
||||
org.setName(organization.getLegalname().getValue());
|
||||
} else {
|
||||
org.setName("");
|
||||
}
|
||||
HashMap<String, Set<String>> organizationPids = new HashMap<>();
|
||||
if (Optional.ofNullable(organization.getPid()).isPresent())
|
||||
organization.getPid().forEach(p -> {
|
||||
if (!organizationPids.containsKey(p.getQualifier().getClassid()))
|
||||
organizationPids.put(p.getQualifier().getClassid(), new HashSet<>());
|
||||
organizationPids.get(p.getQualifier().getClassid()).add(p.getValue());
|
||||
});
|
||||
List<OrganizationPid> pids = new ArrayList<>();
|
||||
for (String key : organizationPids.keySet()) {
|
||||
for (String value : organizationPids.get(key)) {
|
||||
OrganizationPid pid = new OrganizationPid();
|
||||
pid.setValue(value);
|
||||
pid.setType(key);
|
||||
pids.add(pid);
|
||||
}
|
||||
}
|
||||
org.setPid(pids);
|
||||
rOrg.setAffiliation(org);
|
||||
return rOrg;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static Result addAffiliation(Iterator<Tuple2<Result, ResultOrganizations>> it) {
|
||||
Tuple2<Result, ResultOrganizations> first = it.next();
|
||||
if (first._2() == null) {
|
||||
return first._1();
|
||||
}
|
||||
Result ret = first._1();
|
||||
List<Affiliation> affiliation = new ArrayList<>();
|
||||
Set<String> alreadyInsertedAffiliations = new HashSet<>();
|
||||
affiliation.add(first._2().getAffiliation());
|
||||
alreadyInsertedAffiliations.add(first._2().getAffiliation().getId());
|
||||
it.forEachRemaining(res -> {
|
||||
if (!alreadyInsertedAffiliations.contains(res._2().getAffiliation().getId())) {
|
||||
affiliation.add(res._2().getAffiliation());
|
||||
alreadyInsertedAffiliations.add(res._2().getAffiliation().getId());
|
||||
}
|
||||
|
||||
});
|
||||
ret.setAffiliation(affiliation);
|
||||
return ret;
|
||||
}
|
||||
|
||||
private static eu.dnetlib.dhp.eosc.model.Organization mapOrganization(Organization org) {
|
||||
|
||||
if (isToBeDumpedOrg(org))
|
||||
|
|
|
@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -95,6 +96,7 @@ public class SelectEoscResultsJobStep1 implements Serializable {
|
|||
(MapFunction<R, Result>) r -> (Result) ResultMapper
|
||||
.map(r, communityMap, df),
|
||||
Encoders.bean(Result.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
|
|
|
@ -10,6 +10,7 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
|
@ -57,6 +58,9 @@ public class SparkDumpOrganizationProject implements Serializable {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
|
@ -64,16 +68,42 @@ public class SparkDumpOrganizationProject implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath + "/organizationProject");
|
||||
dumpRelation(spark, inputPath, outputPath);
|
||||
dumpRelation(spark, inputPath, outputPath, workingPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void dumpRelation(SparkSession spark, String inputPath, String outputPath) {
|
||||
Dataset<Organization> organization = Utils.readPath(spark, outputPath + "organization", Organization.class);
|
||||
private static void dumpRelation(SparkSession spark, String inputPath, String outputPath, String workingPath) {
|
||||
Dataset<Organization> organization = Utils
|
||||
.readPath(spark, workingPath + "publicationorganization", Organization.class)
|
||||
.union(Utils.readPath(spark, workingPath + "datasetorganization", Organization.class))
|
||||
.union(Utils.readPath(spark, workingPath + "softwareorganization", Organization.class))
|
||||
.union(Utils.readPath(spark, workingPath + "otherresearchproductorganization", Organization.class))
|
||||
.groupByKey((MapFunction<Organization, String>) o -> o.getId(), Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, Organization, Organization>) (k, v) -> v.next(),
|
||||
Encoders.bean(Organization.class));
|
||||
|
||||
Dataset<Project> project = Utils.readPath(spark, outputPath + "project", Project.class);
|
||||
Dataset<Project> project = Utils
|
||||
.readPath(spark, workingPath + "publicationproject", Project.class)
|
||||
.union(Utils.readPath(spark, workingPath + "datasetproject", Project.class))
|
||||
.union(Utils.readPath(spark, workingPath + "softwareproject", Project.class))
|
||||
.union(Utils.readPath(spark, workingPath + "otherresearchproductproject", Project.class))
|
||||
.groupByKey((MapFunction<Project, String>) o -> o.getId(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Project, Project>) (k, v) -> v.next(), Encoders.bean(Project.class));
|
||||
|
||||
organization
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "organization");
|
||||
|
||||
project
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "project");
|
||||
|
||||
Dataset<Relation> relation = Utils
|
||||
.readPath(spark, inputPath + "/relation", Relation.class)
|
||||
|
@ -96,6 +126,42 @@ public class SparkDumpOrganizationProject implements Serializable {
|
|||
.option("compression", "gzip")
|
||||
.json(outputPath + "organizationProject");
|
||||
|
||||
Utils
|
||||
.readPath(spark, workingPath + "publicationresultOrganization", eu.dnetlib.dhp.eosc.model.Relation.class)
|
||||
.union(
|
||||
Utils
|
||||
.readPath(
|
||||
spark, workingPath + "datasetresultOrganization", eu.dnetlib.dhp.eosc.model.Relation.class))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(
|
||||
spark, workingPath + "softwareresultOrganization", eu.dnetlib.dhp.eosc.model.Relation.class))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(
|
||||
spark, workingPath + "otherresearchproductresultOrganization",
|
||||
eu.dnetlib.dhp.eosc.model.Relation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "resultOrganization");
|
||||
|
||||
Utils
|
||||
.readPath(spark, workingPath + "publicationresultProject", eu.dnetlib.dhp.eosc.model.Relation.class)
|
||||
.union(
|
||||
Utils.readPath(spark, workingPath + "datasetresultProject", eu.dnetlib.dhp.eosc.model.Relation.class))
|
||||
.union(
|
||||
Utils.readPath(spark, workingPath + "softwareresultProject", eu.dnetlib.dhp.eosc.model.Relation.class))
|
||||
.union(
|
||||
Utils
|
||||
.readPath(
|
||||
spark, workingPath + "otherresearchproductresultProject",
|
||||
eu.dnetlib.dhp.eosc.model.Relation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "resultProject");
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -132,7 +132,7 @@ public class SparkUpdateProjectInfo implements Serializable {
|
|||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "project");
|
||||
.json(workingPath + resultType + "project");
|
||||
|
||||
result
|
||||
.joinWith(
|
||||
|
@ -152,7 +152,7 @@ public class SparkUpdateProjectInfo implements Serializable {
|
|||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "resultProject");
|
||||
.json(workingPath + resultType + "resultProject");
|
||||
}
|
||||
|
||||
private static eu.dnetlib.dhp.eosc.model.Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p)
|
||||
|
|
|
@ -632,6 +632,7 @@
|
|||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/dump/</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/dump/</arg>
|
||||
</spark>
|
||||
<ok to="make_archive"/>
|
||||
<error to="Kill"/>
|
||||
|
|
Loading…
Reference in New Issue