Moved parameter file. Added "40|" as a prefix to project ids for computing the delta

This commit is contained in:
Miriam Baglioni 2023-08-04 17:18:15 +02:00
parent 097905171a
commit 5fb58362c5
3 changed files with 9 additions and 2 deletions

View File

@ -176,7 +176,10 @@ public class QueryInformationSystem {
for (Object node : el.selectNodes(".//param")) {
Node n = (Node) node;
if (n.valueOf("./@name").equals("openaireId")) {
return prefix + "|" + n.getText();
String id = n.getText();
if (id.startsWith(prefix + "|"))
return id;
return prefix + "|" + id;
}
}

View File

@ -58,7 +58,11 @@ public class ProjectsSubsetSparkJob implements Serializable {
String projectListPath) {
Dataset<String> projectList = spark.read().textFile(projectListPath);
Dataset<Project> projects;
projects = Utils.readPath(spark, inputPath, Project.class);
projects = Utils.readPath(spark, inputPath, Project.class)
.map((MapFunction<Project, Project>) p -> {
p.setId("40|" + p.getId());
return p;
}, Encoders.bean(Project.class));
projects
.joinWith(projectList, projects.col("id").equalTo(projectList.col("value")), "left")
.map((MapFunction<Tuple2<Project, String>, Project>) t2 -> {