forked from D-Net/dnet-hadoop
code formatting
This commit is contained in:
parent
73c49b8d26
commit
eed9fe0902
|
@@ -33,7 +33,7 @@ import scala.Tuple2;
|
||||||
public class GroupEntitiesSparkJob {
|
public class GroupEntitiesSparkJob {
|
||||||
private static final Logger log = LoggerFactory.getLogger(GroupEntitiesSparkJob.class);
|
private static final Logger log = LoggerFactory.getLogger(GroupEntitiesSparkJob.class);
|
||||||
|
|
||||||
private static final Encoder<OafEntity> OAFENTITY_KRYO_ENC = Encoders.kryo(OafEntity.class);
|
private static final Encoder<OafEntity> OAFENTITY_KRYO_ENC = Encoders.kryo(OafEntity.class);
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
@@ -114,7 +114,7 @@ public class GroupEntitiesSparkJob {
|
||||||
Encoders.tuple(Encoders.STRING(), OAFENTITY_KRYO_ENC));
|
Encoders.tuple(Encoders.STRING(), OAFENTITY_KRYO_ENC));
|
||||||
|
|
||||||
// pivot on "_1" (classname of the entity)
|
// pivot on "_1" (classname of the entity)
|
||||||
// created columns containing only entities of the same class
|
// created columns containing only entities of the same class
|
||||||
for (Map.Entry<EntityType, Class> e : ModelSupport.entityTypes.entrySet()) {
|
for (Map.Entry<EntityType, Class> e : ModelSupport.entityTypes.entrySet()) {
|
||||||
String entity = e.getKey().name();
|
String entity = e.getKey().name();
|
||||||
Class<? extends OafEntity> entityClass = e.getValue();
|
Class<? extends OafEntity> entityClass = e.getValue();
|
||||||
|
|
|
@@ -67,60 +67,60 @@ public class SparkPropagateRelation extends AbstractSparkAction {
|
||||||
log.info("graphOutputPath: '{}'", graphOutputPath);
|
log.info("graphOutputPath: '{}'", graphOutputPath);
|
||||||
|
|
||||||
Dataset<Relation> mergeRels = spark
|
Dataset<Relation> mergeRels = spark
|
||||||
.read()
|
.read()
|
||||||
.load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
|
.load(DedupUtility.createMergeRelPath(workingPath, "*", "*"))
|
||||||
.as(REL_BEAN_ENC);
|
.as(REL_BEAN_ENC);
|
||||||
|
|
||||||
// <mergedObjectID, dedupID>
|
// <mergedObjectID, dedupID>
|
||||||
Dataset<Row> idsToMerge = mergeRels
|
Dataset<Row> idsToMerge = mergeRels
|
||||||
.where(col("relClass").equalTo(ModelConstants.MERGES))
|
.where(col("relClass").equalTo(ModelConstants.MERGES))
|
||||||
.select(col("source").as("dedupID"), col("target").as("mergedObjectID"))
|
.select(col("source").as("dedupID"), col("target").as("mergedObjectID"))
|
||||||
.distinct();
|
.distinct();
|
||||||
|
|
||||||
Dataset<Row> allRels = spark
|
Dataset<Row> allRels = spark
|
||||||
.read()
|
.read()
|
||||||
.schema(REL_BEAN_ENC.schema())
|
.schema(REL_BEAN_ENC.schema())
|
||||||
.json(graphBasePath + "/relation");
|
.json(graphBasePath + "/relation");
|
||||||
|
|
||||||
Dataset<Relation> dedupedRels = allRels
|
Dataset<Relation> dedupedRels = allRels
|
||||||
.joinWith(idsToMerge, allRels.col("source").equalTo(idsToMerge.col("mergedObjectID")), "left_outer")
|
.joinWith(idsToMerge, allRels.col("source").equalTo(idsToMerge.col("mergedObjectID")), "left_outer")
|
||||||
.joinWith(idsToMerge, col("_1.target").equalTo(idsToMerge.col("mergedObjectID")), "left_outer")
|
.joinWith(idsToMerge, col("_1.target").equalTo(idsToMerge.col("mergedObjectID")), "left_outer")
|
||||||
.select("_1._1", "_1._2.dedupID", "_2.dedupID")
|
.select("_1._1", "_1._2.dedupID", "_2.dedupID")
|
||||||
.as(Encoders.tuple(REL_BEAN_ENC, Encoders.STRING(), Encoders.STRING()))
|
.as(Encoders.tuple(REL_BEAN_ENC, Encoders.STRING(), Encoders.STRING()))
|
||||||
.map((MapFunction<Tuple3<Relation, String, String>, Relation>) t -> {
|
.map((MapFunction<Tuple3<Relation, String, String>, Relation>) t -> {
|
||||||
Relation rel = t._1();
|
Relation rel = t._1();
|
||||||
String newSource = t._2();
|
String newSource = t._2();
|
||||||
String newTarget = t._3();
|
String newTarget = t._3();
|
||||||
|
|
||||||
if (rel.getDataInfo() == null) {
|
if (rel.getDataInfo() == null) {
|
||||||
rel.setDataInfo(new DataInfo());
|
rel.setDataInfo(new DataInfo());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (newSource != null || newTarget != null) {
|
if (newSource != null || newTarget != null) {
|
||||||
rel.getDataInfo().setDeletedbyinference(false);
|
rel.getDataInfo().setDeletedbyinference(false);
|
||||||
|
|
||||||
if (newSource != null)
|
if (newSource != null)
|
||||||
rel.setSource(newSource);
|
rel.setSource(newSource);
|
||||||
|
|
||||||
if (newTarget != null)
|
if (newTarget != null)
|
||||||
rel.setTarget(newTarget);
|
rel.setTarget(newTarget);
|
||||||
}
|
}
|
||||||
|
|
||||||
return rel;
|
return rel;
|
||||||
}, REL_BEAN_ENC);
|
}, REL_BEAN_ENC);
|
||||||
|
|
||||||
// ids of records that are both not deletedbyinference and not invisible
|
// ids of records that are both not deletedbyinference and not invisible
|
||||||
Dataset<Row> ids = validIds(spark, graphBasePath);
|
Dataset<Row> ids = validIds(spark, graphBasePath);
|
||||||
|
|
||||||
// filter relations that point to valid records, can force them to be visible
|
// filter relations that point to valid records, can force them to be visible
|
||||||
Dataset<Relation> cleanedRels = dedupedRels
|
Dataset<Relation> cleanedRels = dedupedRels
|
||||||
.join(ids, col("source").equalTo(ids.col("id")), "leftsemi")
|
.join(ids, col("source").equalTo(ids.col("id")), "leftsemi")
|
||||||
.join(ids, col("target").equalTo(ids.col("id")), "leftsemi")
|
.join(ids, col("target").equalTo(ids.col("id")), "leftsemi")
|
||||||
.as(REL_BEAN_ENC)
|
.as(REL_BEAN_ENC)
|
||||||
.map((MapFunction<Relation, Relation>) r -> {
|
.map((MapFunction<Relation, Relation>) r -> {
|
||||||
r.getDataInfo().setInvisible(false);
|
r.getDataInfo().setInvisible(false);
|
||||||
return r;
|
return r;
|
||||||
}, REL_KRYO_ENC);
|
}, REL_KRYO_ENC);
|
||||||
|
|
||||||
Dataset<Relation> distinctRels = cleanedRels
|
Dataset<Relation> distinctRels = cleanedRels
|
||||||
.groupByKey(
|
.groupByKey(
|
||||||
|
|
|
@@ -1,14 +1,14 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.group;
|
package eu.dnetlib.dhp.oa.graph.group;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import java.io.IOException;
|
||||||
import eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob;
|
import java.net.URISyntaxException;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import java.nio.file.Files;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
import java.nio.file.Path;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import java.nio.file.Paths;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
@@ -18,108 +18,108 @@ import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.*;
|
||||||
|
|
||||||
import java.io.IOException;
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
import java.net.URISyntaxException;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.nio.file.Paths;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
import eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
public class GroupEntitiesSparkJobTest {
|
public class GroupEntitiesSparkJobTest {
|
||||||
|
|
||||||
private static SparkSession spark;
|
private static SparkSession spark;
|
||||||
|
|
||||||
private static ObjectMapper mapper = new ObjectMapper()
|
private static ObjectMapper mapper = new ObjectMapper()
|
||||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
private static Path workingDir;
|
private static Path workingDir;
|
||||||
private Path dataInputPath;
|
private Path dataInputPath;
|
||||||
|
|
||||||
private Path checkpointPath;
|
private Path checkpointPath;
|
||||||
|
|
||||||
private Path outputPath;
|
private Path outputPath;
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void beforeAll() throws IOException {
|
public static void beforeAll() throws IOException {
|
||||||
workingDir = Files.createTempDirectory(GroupEntitiesSparkJob.class.getSimpleName());
|
workingDir = Files.createTempDirectory(GroupEntitiesSparkJob.class.getSimpleName());
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
conf.setAppName(GroupEntitiesSparkJob.class.getSimpleName());
|
conf.setAppName(GroupEntitiesSparkJob.class.getSimpleName());
|
||||||
conf.setMaster("local");
|
conf.setMaster("local");
|
||||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||||
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
|
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
|
||||||
spark = SparkSession.builder().config(conf).getOrCreate();
|
spark = SparkSession.builder().config(conf).getOrCreate();
|
||||||
}
|
}
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void beforeEach() throws IOException, URISyntaxException {
|
public void beforeEach() throws IOException, URISyntaxException {
|
||||||
dataInputPath = Paths.get(ClassLoader.getSystemResource("eu/dnetlib/dhp/oa/graph/group").toURI());
|
dataInputPath = Paths.get(ClassLoader.getSystemResource("eu/dnetlib/dhp/oa/graph/group").toURI());
|
||||||
checkpointPath = workingDir.resolve("grouped_entity");
|
checkpointPath = workingDir.resolve("grouped_entity");
|
||||||
outputPath = workingDir.resolve("dispatched_entity");
|
outputPath = workingDir.resolve("dispatched_entity");
|
||||||
}
|
}
|
||||||
|
|
||||||
@AfterAll
|
@AfterAll
|
||||||
public static void afterAll() throws IOException {
|
public static void afterAll() throws IOException {
|
||||||
spark.stop();
|
spark.stop();
|
||||||
FileUtils.deleteDirectory(workingDir.toFile());
|
FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(1)
|
@Order(1)
|
||||||
void testGroupEntities() throws Exception {
|
void testGroupEntities() throws Exception {
|
||||||
GroupEntitiesSparkJob.main(new String[]{
|
GroupEntitiesSparkJob.main(new String[] {
|
||||||
"-isSparkSessionManaged",
|
"-isSparkSessionManaged",
|
||||||
Boolean.FALSE.toString(),
|
Boolean.FALSE.toString(),
|
||||||
"-graphInputPath",
|
"-graphInputPath",
|
||||||
dataInputPath.toString(),
|
dataInputPath.toString(),
|
||||||
"-checkpointPath",
|
"-checkpointPath",
|
||||||
checkpointPath.toString(),
|
checkpointPath.toString(),
|
||||||
"-outputPath",
|
"-outputPath",
|
||||||
outputPath.toString(),
|
outputPath.toString(),
|
||||||
"-filterInvisible",
|
"-filterInvisible",
|
||||||
Boolean.FALSE.toString()
|
Boolean.FALSE.toString()
|
||||||
});
|
});
|
||||||
|
|
||||||
Dataset<OafEntity> checkpointTable = spark
|
Dataset<OafEntity> checkpointTable = spark
|
||||||
.read()
|
.read()
|
||||||
.load(checkpointPath.toString())
|
.load(checkpointPath.toString())
|
||||||
.selectExpr("COALESCE(*)")
|
.selectExpr("COALESCE(*)")
|
||||||
.as(Encoders.kryo(OafEntity.class));
|
.as(Encoders.kryo(OafEntity.class));
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
1,
|
||||||
|
checkpointTable
|
||||||
|
.filter(
|
||||||
|
(FilterFunction<OafEntity>) r -> "50|doi_________::09821844208a5cd6300b2bfb13bca1b9"
|
||||||
|
.equals(r.getId()) &&
|
||||||
|
r.getCollectedfrom().stream().anyMatch(kv -> kv.getValue().equalsIgnoreCase("zenodo")))
|
||||||
|
.count());
|
||||||
|
|
||||||
assertEquals(
|
Dataset<Result> output = spark
|
||||||
1,
|
.read()
|
||||||
checkpointTable
|
.textFile(
|
||||||
.filter(
|
DHPUtils
|
||||||
(FilterFunction<OafEntity>) r -> "50|doi_________::09821844208a5cd6300b2bfb13bca1b9"
|
.toSeq(
|
||||||
.equals(r.getId()) &&
|
HdfsSupport
|
||||||
r.getCollectedfrom().stream().anyMatch(kv -> kv.getValue().equalsIgnoreCase("zenodo")))
|
.listFiles(outputPath.toString(), spark.sparkContext().hadoopConfiguration())))
|
||||||
.count());
|
.map((MapFunction<String, Result>) s -> mapper.readValue(s, Result.class), Encoders.bean(Result.class));
|
||||||
|
|
||||||
|
assertEquals(3, output.count());
|
||||||
Dataset<Result> output = spark
|
assertEquals(
|
||||||
.read()
|
2,
|
||||||
.textFile(
|
output
|
||||||
DHPUtils
|
.map((MapFunction<Result, String>) r -> r.getResulttype().getClassid(), Encoders.STRING())
|
||||||
.toSeq(
|
.filter((FilterFunction<String>) s -> s.equals("publication"))
|
||||||
HdfsSupport
|
.count());
|
||||||
.listFiles(outputPath.toString(), spark.sparkContext().hadoopConfiguration())))
|
assertEquals(
|
||||||
.map((MapFunction<String, Result>) s -> mapper.readValue(s, Result.class), Encoders.bean(Result.class));
|
1,
|
||||||
|
output
|
||||||
assertEquals(3, output.count());
|
.map((MapFunction<Result, String>) r -> r.getResulttype().getClassid(), Encoders.STRING())
|
||||||
assertEquals(
|
.filter((FilterFunction<String>) s -> s.equals("dataset"))
|
||||||
2,
|
.count());
|
||||||
output
|
}
|
||||||
.map((MapFunction<Result, String>) r -> r.getResulttype().getClassid(), Encoders.STRING())
|
|
||||||
.filter((FilterFunction<String>) s -> s.equals("publication"))
|
|
||||||
.count());
|
|
||||||
assertEquals(
|
|
||||||
1,
|
|
||||||
output
|
|
||||||
.map((MapFunction<Result, String>) r -> r.getResulttype().getClassid(), Encoders.STRING())
|
|
||||||
.filter((FilterFunction<String>) s -> s.equals("dataset"))
|
|
||||||
.count());
|
|
||||||
}
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue