enrichment steps #38
|
@ -75,17 +75,27 @@ public class PrepareProjects {
|
||||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void exec(SparkSession spark, String progjectPath, String dbProjectPath, String outputPath) {
|
private static void exec(SparkSession spark, String projectPath, String dbProjectPath, String outputPath) {
|
||||||
Dataset<CSVProject> project = readPath(spark, progjectPath, CSVProject.class);
|
Dataset<CSVProject> project = readPath(spark, projectPath, CSVProject.class);
|
||||||
Dataset<ProjectSubset> dbProjects = readPath(spark, dbProjectPath, ProjectSubset.class);
|
Dataset<ProjectSubset> dbProjects = readPath(spark, dbProjectPath, ProjectSubset.class);
|
||||||
|
|
||||||
dbProjects.joinWith(project, dbProjects.col("code").equalTo(project.col("id")), "left")
|
dbProjects
|
||||||
.flatMap((FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject>) value -> {
|
.joinWith(project, dbProjects.col("code").equalTo(project.col("id")), "left")
|
||||||
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
|
.flatMap(getTuple2CSVProjectFlatMapFunction(), Encoders.bean(CSVProject.class))
|
||||||
if(! csvProject.isPresent()){
|
.filter(Objects::nonNull)
|
||||||
return null;
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject> getTuple2CSVProjectFlatMapFunction() {
|
||||||
|
return (FlatMapFunction<Tuple2<ProjectSubset, CSVProject>, CSVProject>) value -> {
|
||||||
|
Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
|
||||||
List<CSVProject> csvProjectList = new ArrayList<>();
|
List<CSVProject> csvProjectList = new ArrayList<>();
|
||||||
|
if (csvProject.isPresent()) {
|
||||||
|
|
||||||
String[] programme = csvProject.get().getProgramme().split(";");
|
String[] programme = csvProject.get().getProgramme().split(";");
|
||||||
Arrays
|
Arrays
|
||||||
.stream(programme)
|
.stream(programme)
|
||||||
|
@ -95,36 +105,9 @@ public class PrepareProjects {
|
||||||
proj.setId(csvProject.get().getId());
|
proj.setId(csvProject.get().getId());
|
||||||
csvProjectList.add(proj);
|
csvProjectList.add(proj);
|
||||||
});
|
});
|
||||||
|
}
|
||||||
return csvProjectList.iterator();
|
return csvProjectList.iterator();
|
||||||
}, Encoders.bean(CSVProject.class))
|
};
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(outputPath);
|
|
||||||
//
|
|
||||||
// .map(value -> {
|
|
||||||
// Optional<CSVProject> csvProject = Optional.ofNullable(value._2());
|
|
||||||
// }, Encoders.bean(CSVProject.class))
|
|
||||||
// .filter(Objects::nonNull)
|
|
||||||
// .toJavaRDD()
|
|
||||||
// .flatMap(p -> {
|
|
||||||
// List<CSVProject> csvProjectList = new ArrayList<>();
|
|
||||||
// String[] programme = p.getProgramme().split(";");
|
|
||||||
// Arrays
|
|
||||||
// .stream(programme)
|
|
||||||
// .forEach(value -> {
|
|
||||||
// CSVProject csvProject = new CSVProject();
|
|
||||||
// csvProject.setProgramme(value);
|
|
||||||
// csvProject.setId(p.getId());
|
|
||||||
// csvProjectList.add(csvProject);
|
|
||||||
// });
|
|
||||||
//
|
|
||||||
// return csvProjectList.iterator();
|
|
||||||
// })
|
|
||||||
// .map(p -> OBJECT_MAPPER.writeValueAsString(p))
|
|
||||||
// .saveAsTextFile(outputPath);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <R> Dataset<R> readPath(
|
public static <R> Dataset<R> readPath(
|
||||||
|
|
|
@ -1,15 +1,5 @@
|
||||||
package eu.dnetlib.dhp.actionmanager.project;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
package eu.dnetlib.dhp.actionmanager.project;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.common.DbClient;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
|
|
||||||
import java.io.BufferedWriter;
|
import java.io.BufferedWriter;
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
|
@ -22,6 +12,19 @@ import java.util.List;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.DbClient;
|
||||||
|
|
||||||
public class ReadProjectsFromDB implements Closeable {
|
public class ReadProjectsFromDB implements Closeable {
|
||||||
|
|
||||||
private final DbClient dbClient;
|
private final DbClient dbClient;
|
||||||
|
@ -52,11 +55,12 @@ public class ReadProjectsFromDB implements Closeable {
|
||||||
try (final ReadProjectsFromDB rbl = new ReadProjectsFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
|
try (final ReadProjectsFromDB rbl = new ReadProjectsFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
|
||||||
dbPassword)) {
|
dbPassword)) {
|
||||||
|
|
||||||
log.info("Processing blacklist...");
|
log.info("Processing projects...");
|
||||||
rbl.execute(query, rbl::processProjectsEntry);
|
rbl.execute(query, rbl::processProjectsEntry);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void execute(final String sql, final Function<ResultSet, List<ProjectSubset>> producer) throws Exception {
|
public void execute(final String sql, final Function<ResultSet, List<ProjectSubset>> producer) throws Exception {
|
||||||
|
|
||||||
final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(r -> writeProject(r));
|
final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(r -> writeProject(r));
|
||||||
|
@ -100,7 +104,6 @@ public class ReadProjectsFromDB implements Closeable {
|
||||||
}
|
}
|
||||||
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||||
|
|
||||||
|
|
||||||
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
|
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -110,4 +113,3 @@ public class ReadProjectsFromDB implements Closeable {
|
||||||
writer.close();
|
writer.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -96,7 +96,7 @@
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>PrepareProgramme</name>
|
<name>PrepareProjects</name>
|
||||||
<class>eu.dnetlib.dhp.actionmanager.project.PrepareProjects</class>
|
<class>eu.dnetlib.dhp.actionmanager.project.PrepareProjects</class>
|
||||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
|
@ -111,6 +111,7 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--projectPath</arg><arg>${workingDir}/projects</arg>
|
<arg>--projectPath</arg><arg>${workingDir}/projects</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/preparedProjects</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedProjects</arg>
|
||||||
|
<arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="create_updates"/>
|
<ok to="create_updates"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -67,7 +67,7 @@ public class PrepareProjectTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void numberDistinctProgrammeTest() throws Exception {
|
public void numberDistinctProjectTest() throws Exception {
|
||||||
PrepareProjects
|
PrepareProjects
|
||||||
.main(
|
.main(
|
||||||
new String[] {
|
new String[] {
|
||||||
|
@ -76,7 +76,10 @@ public class PrepareProjectTest {
|
||||||
"-projectPath",
|
"-projectPath",
|
||||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/projects_subset.json").getPath(),
|
getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/projects_subset.json").getPath(),
|
||||||
"-outputPath",
|
"-outputPath",
|
||||||
workingDir.toString() + "/preparedProjects"
|
workingDir.toString() + "/preparedProjects",
|
||||||
|
"-dbProjectPath",
|
||||||
|
getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/dbProject").getPath(),
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
@ -85,7 +88,7 @@ public class PrepareProjectTest {
|
||||||
.textFile(workingDir.toString() + "/preparedProjects")
|
.textFile(workingDir.toString() + "/preparedProjects")
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, CSVProject.class));
|
.map(item -> OBJECT_MAPPER.readValue(item, CSVProject.class));
|
||||||
|
|
||||||
Assertions.assertEquals(20, tmp.count());
|
Assertions.assertEquals(8, tmp.count());
|
||||||
|
|
||||||
Dataset<CSVProject> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(CSVProject.class));
|
Dataset<CSVProject> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(CSVProject.class));
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@ import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -21,6 +20,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
|
|
||||||
public class SparkUpdateProjectTest {
|
public class SparkUpdateProjectTest {
|
||||||
|
@ -86,13 +86,10 @@ public class SparkUpdateProjectTest {
|
||||||
JavaRDD<Project> tmp = sc
|
JavaRDD<Project> tmp = sc
|
||||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||||
.map(aa -> ((Project)aa.getPayload()))
|
.map(aa -> ((Project) aa.getPayload()));
|
||||||
;
|
|
||||||
|
|
||||||
Assertions.assertEquals(14, tmp.count());
|
Assertions.assertEquals(14, tmp.count());
|
||||||
|
|
||||||
// Dataset<CSVProgramme> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(CSVProgramme.class));
|
|
||||||
//
|
|
||||||
// Assertions.assertEquals(0, verificationDataset.filter("shortTitle =''").count());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
{"code":"894593"}
|
||||||
|
{"code":"897004"}
|
||||||
|
{"code":"896300"}
|
||||||
|
{"code":"892890"}
|
||||||
|
{"code":"886828"}
|
||||||
|
{"code":"8867767"}
|
||||||
|
{"code":"101003374"}
|
||||||
|
{"code":"886776"}
|
Loading…
Reference in New Issue