code formatting

Claudio Atzori 2023-10-06 12:31:17 +02:00
parent 73c49b8d26
commit eed9fe0902
3 changed files with 133 additions and 133 deletions

View File

@@ -33,7 +33,7 @@ import scala.Tuple2;
public class GroupEntitiesSparkJob {

    private static final Logger log = LoggerFactory.getLogger(GroupEntitiesSparkJob.class);

    private static final Encoder<OafEntity> OAFENTITY_KRYO_ENC = Encoders.kryo(OafEntity.class);

    public static void main(String[] args) throws Exception {
@@ -114,7 +114,7 @@ public class GroupEntitiesSparkJob {
                Encoders.tuple(Encoders.STRING(), OAFENTITY_KRYO_ENC));

        // pivot on "_1" (classname of the entity)
        // created columns containing only entities of the same class
        for (Map.Entry<EntityType, Class> e : ModelSupport.entityTypes.entrySet()) {
            String entity = e.getKey().name();
            Class<? extends OafEntity> entityClass = e.getValue();
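
Editorial note on this hunk: per the comments above, the job pivots on the "_1" column (the entity class name) and the loop walks ModelSupport.entityTypes to produce one output per entity class. The snippet below is an illustrative sketch of that "keep only one class at a time" idea in plain Spark, using a hypothetical MyEntity payload and a simple per-class filter instead of the column pivot performed by the actual job:

import static org.apache.spark.sql.functions.col;

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class PivotByClassNameSketch {

    // hypothetical payload standing in for OafEntity
    public static class MyEntity implements Serializable {
        public String id;

        public MyEntity() {
        }

        public MyEntity(String id) {
            this.id = id;
        }
    }

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local").appName("pivot-sketch").getOrCreate();

        Encoder<MyEntity> kryoEnc = Encoders.kryo(MyEntity.class);
        Encoder<Tuple2<String, MyEntity>> pairEnc = Encoders.tuple(Encoders.STRING(), kryoEnc);

        // (classname, entity) pairs, as produced by the upstream grouping step
        Dataset<Tuple2<String, MyEntity>> grouped = spark
            .createDataset(
                Arrays.asList(
                    new Tuple2<>("publication", new MyEntity("p1")),
                    new Tuple2<>("dataset", new MyEntity("d1"))),
                pairEnc);

        // one pass per entity type: keep only the rows of that class, then drop the key column
        for (String entity : Arrays.asList("publication", "dataset")) {
            Dataset<MyEntity> ofType = grouped
                .filter(col("_1").equalTo(entity))
                .map((MapFunction<Tuple2<String, MyEntity>, MyEntity>) t -> t._2(), kryoEnc);
            System.out.println(entity + ": " + ofType.count());
        }

        spark.stop();
    }
}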

View File

@@ -67,60 +67,60 @@ public class SparkPropagateRelation extends AbstractSparkAction {
        log.info("graphOutputPath: '{}'", graphOutputPath);

        Dataset<Relation> mergeRels = spark
            .read()
            .load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
            .as(REL_BEAN_ENC);

        // <mergedObjectID, dedupID>
        Dataset<Row> idsToMerge = mergeRels
            .where(col("relClass").equalTo(ModelConstants.MERGES))
            .select(col("source").as("dedupID"), col("target").as("mergedObjectID"))
            .distinct();

        Dataset<Row> allRels = spark
            .read()
            .schema(REL_BEAN_ENC.schema())
            .json(graphBasePath + "/relation");

        Dataset<Relation> dedupedRels = allRels
            .joinWith(idsToMerge, allRels.col("source").equalTo(idsToMerge.col("mergedObjectID")), "left_outer")
            .joinWith(idsToMerge, col("_1.target").equalTo(idsToMerge.col("mergedObjectID")), "left_outer")
            .select("_1._1", "_1._2.dedupID", "_2.dedupID")
            .as(Encoders.tuple(REL_BEAN_ENC, Encoders.STRING(), Encoders.STRING()))
            .map((MapFunction<Tuple3<Relation, String, String>, Relation>) t -> {
                Relation rel = t._1();
                String newSource = t._2();
                String newTarget = t._3();

                if (rel.getDataInfo() == null) {
                    rel.setDataInfo(new DataInfo());
                }

                if (newSource != null || newTarget != null) {
                    rel.getDataInfo().setDeletedbyinference(false);

                    if (newSource != null)
                        rel.setSource(newSource);

                    if (newTarget != null)
                        rel.setTarget(newTarget);
                }

                return rel;
            }, REL_BEAN_ENC);

        // ids of records that are both not deletedbyinference and not invisible
        Dataset<Row> ids = validIds(spark, graphBasePath);

        // filter relations that point to valid records, can force them to be visible
        Dataset<Relation> cleanedRels = dedupedRels
            .join(ids, col("source").equalTo(ids.col("id")), "leftsemi")
            .join(ids, col("target").equalTo(ids.col("id")), "leftsemi")
            .as(REL_BEAN_ENC)
            .map((MapFunction<Relation, Relation>) r -> {
                r.getDataInfo().setInvisible(false);
                return r;
            }, REL_KRYO_ENC);

        Dataset<Relation> distinctRels = cleanedRels
            .groupByKey(
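
Editorial note on this hunk: the densest part is the left_outer joinWith against the <mergedObjectID, dedupID> pairs, which keeps every relation and leaves a null on the mapping side when the endpoint was never merged; the subsequent map rewrites the endpoint only when a mapping was found. Below is a reduced, illustrative sketch of the source-side rewrite alone, using hypothetical Rel and IdMapping beans in place of the OpenAIRE Relation model:

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class PropagateSourceSketch {

    // hypothetical stand-in for Relation
    public static class Rel implements Serializable {
        private String source, target;
        public String getSource() { return source; }
        public void setSource(String source) { this.source = source; }
        public String getTarget() { return target; }
        public void setTarget(String target) { this.target = target; }
    }

    // hypothetical stand-in for the idsToMerge rows
    public static class IdMapping implements Serializable {
        private String mergedObjectID, dedupID;
        public String getMergedObjectID() { return mergedObjectID; }
        public void setMergedObjectID(String v) { this.mergedObjectID = v; }
        public String getDedupID() { return dedupID; }
        public void setDedupID(String v) { this.dedupID = v; }
    }

    static Rel rel(String s, String t) { Rel r = new Rel(); r.setSource(s); r.setTarget(t); return r; }
    static IdMapping mapping(String merged, String dedup) { IdMapping m = new IdMapping(); m.setMergedObjectID(merged); m.setDedupID(dedup); return m; }

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local").appName("propagate-sketch").getOrCreate();

        Dataset<Rel> rels = spark.createDataset(
            Arrays.asList(rel("A", "B"), rel("C", "D")), Encoders.bean(Rel.class));
        Dataset<IdMapping> idsToMerge = spark.createDataset(
            Arrays.asList(mapping("A", "dedup_A")), Encoders.bean(IdMapping.class));

        // left_outer joinWith keeps every relation; t._2() is null when the source was never merged
        Dataset<Rel> rewritten = rels
            .joinWith(idsToMerge, rels.col("source").equalTo(idsToMerge.col("mergedObjectID")), "left_outer")
            .map((MapFunction<Tuple2<Rel, IdMapping>, Rel>) t -> {
                Rel r = t._1();
                if (t._2() != null)
                    r.setSource(t._2().getDedupID()); // point the relation at the dedup record
                return r;
            }, Encoders.bean(Rel.class));

        rewritten.show();
        spark.stop();
    }
}

The target side is handled the same way by the second joinWith in the commit above.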

View File

@@ -1,14 +1,14 @@
package eu.dnetlib.dhp.oa.graph.group;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
@@ -18,108 +18,108 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.DHPUtils;

@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class GroupEntitiesSparkJobTest {

    private static SparkSession spark;

    private static ObjectMapper mapper = new ObjectMapper()
        .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    private static Path workingDir;
    private Path dataInputPath;
    private Path checkpointPath;
    private Path outputPath;

    @BeforeAll
    public static void beforeAll() throws IOException {
        workingDir = Files.createTempDirectory(GroupEntitiesSparkJob.class.getSimpleName());

        SparkConf conf = new SparkConf();
        conf.setAppName(GroupEntitiesSparkJob.class.getSimpleName());
        conf.setMaster("local");
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        conf.registerKryoClasses(ModelSupport.getOafModelClasses());
        spark = SparkSession.builder().config(conf).getOrCreate();
    }

    @BeforeEach
    public void beforeEach() throws IOException, URISyntaxException {
        dataInputPath = Paths.get(ClassLoader.getSystemResource("eu/dnetlib/dhp/oa/graph/group").toURI());
        checkpointPath = workingDir.resolve("grouped_entity");
        outputPath = workingDir.resolve("dispatched_entity");
    }

    @AfterAll
    public static void afterAll() throws IOException {
        spark.stop();
        FileUtils.deleteDirectory(workingDir.toFile());
    }

    @Test
    @Order(1)
    void testGroupEntities() throws Exception {
        GroupEntitiesSparkJob.main(new String[] {
            "-isSparkSessionManaged",
            Boolean.FALSE.toString(),
            "-graphInputPath",
            dataInputPath.toString(),
            "-checkpointPath",
            checkpointPath.toString(),
            "-outputPath",
            outputPath.toString(),
            "-filterInvisible",
            Boolean.FALSE.toString()
        });

        Dataset<OafEntity> checkpointTable = spark
            .read()
            .load(checkpointPath.toString())
            .selectExpr("COALESCE(*)")
            .as(Encoders.kryo(OafEntity.class));

        assertEquals(
            1,
            checkpointTable
                .filter(
                    (FilterFunction<OafEntity>) r -> "50|doi_________::09821844208a5cd6300b2bfb13bca1b9"
                        .equals(r.getId()) &&
                        r.getCollectedfrom().stream().anyMatch(kv -> kv.getValue().equalsIgnoreCase("zenodo")))
                .count());

        Dataset<Result> output = spark
            .read()
            .textFile(
                DHPUtils
                    .toSeq(
                        HdfsSupport
                            .listFiles(outputPath.toString(), spark.sparkContext().hadoopConfiguration())))
            .map((MapFunction<String, Result>) s -> mapper.readValue(s, Result.class), Encoders.bean(Result.class));

        assertEquals(3, output.count());
        assertEquals(
            2,
            output
                .map((MapFunction<Result, String>) r -> r.getResulttype().getClassid(), Encoders.STRING())
                .filter((FilterFunction<String>) s -> s.equals("publication"))
                .count());
        assertEquals(
            1,
            output
                .map((MapFunction<Result, String>) r -> r.getResulttype().getClassid(), Encoders.STRING())
                .filter((FilterFunction<String>) s -> s.equals("dataset"))
                .count());
    }
}
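
Editorial note on the selectExpr("COALESCE(*)") call in the test above: the checkpoint written by GroupEntitiesSparkJob keeps one column per entity type, so coalescing across all columns picks out the single non-null value in each row. The toy example below illustrates the expression with plain string columns (the real checkpoint holds Kryo-encoded binary columns); the column names and values here are invented for illustration:

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class CoalesceStarSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local").appName("coalesce-sketch").getOrCreate();

        // toy stand-in for the checkpoint: one column per entity type, only one of them non-null per row
        StructType schema = new StructType()
            .add("publication", DataTypes.StringType)
            .add("dataset", DataTypes.StringType);

        Dataset<Row> pivoted = spark.createDataFrame(
            Arrays.asList(
                RowFactory.create("pub-1", null),
                RowFactory.create(null, "ds-1")),
            schema);

        // COALESCE(*) expands to COALESCE(publication, dataset): one non-null value per row
        pivoted.selectExpr("COALESCE(*)").show();

        spark.stop();
    }
}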