forked from D-Net/dnet-hadoop
cleanup
This commit is contained in:
parent
9374ff03ea
commit
a2938dd059
|
@@ -27,15 +27,15 @@ public class DedupRecordFactory {
|
||||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
public static <T extends OafEntity> Dataset<T> createDedupRecord(
|
public static <T extends OafEntity> Dataset<T> createDedupRecord(
|
||||||
final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final Class<T> clazz, final DedupConfig dedupConf) {
|
final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final Class<T> clazz) {
|
||||||
|
|
||||||
long ts = System.currentTimeMillis();
|
long ts = System.currentTimeMillis();
|
||||||
|
|
||||||
//<id, json_entity>
|
//<id, json_entity>
|
||||||
Dataset<Tuple2<String, T>> entities = spark.read()
|
Dataset<Tuple2<String, T>> entities = spark.read()
|
||||||
.textFile(entitiesInputPath)
|
.textFile(entitiesInputPath)
|
||||||
.map((MapFunction<String, Tuple2<String, T>>) it -> {
|
.map((MapFunction<String, Tuple2<String, T>>) s -> {
|
||||||
T entity = OBJECT_MAPPER.readValue(it, clazz);
|
T entity = OBJECT_MAPPER.readValue(s, clazz);
|
||||||
return new Tuple2<>(entity.getId(), entity);
|
return new Tuple2<>(entity.getId(), entity);
|
||||||
}, Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
}, Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
||||||
|
|
||||||
|
|
|
@@ -10,10 +10,7 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
@@ -70,7 +67,7 @@ public class SparkCreateDedupRecord extends AbstractSparkAction {
|
||||||
|
|
||||||
Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity));
|
Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity));
|
||||||
|
|
||||||
DedupRecordFactory.createDedupRecord(spark, mergeRelPath, entityPath, clazz, dedupConf)
|
DedupRecordFactory.createDedupRecord(spark, mergeRelPath, entityPath, clazz)
|
||||||
.map((MapFunction<OafEntity, String>) value -> OBJECT_MAPPER.writeValueAsString(value), Encoders.STRING())
|
.map((MapFunction<OafEntity, String>) value -> OBJECT_MAPPER.writeValueAsString(value), Encoders.STRING())
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
|
|
|
@@ -95,7 +95,6 @@ public class SparkDedupTest implements Serializable {
|
||||||
IOUtils.toString(
|
IOUtils.toString(
|
||||||
SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));
|
SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));
|
||||||
parser.parseArgument(new String[]{
|
parser.parseArgument(new String[]{
|
||||||
"-mt", "local[*]",
|
|
||||||
"-i", testGraphBasePath,
|
"-i", testGraphBasePath,
|
||||||
"-asi", testActionSetId,
|
"-asi", testActionSetId,
|
||||||
"-la", "lookupurl",
|
"-la", "lookupurl",
|
||||||
|
@@ -120,7 +119,6 @@ public class SparkDedupTest implements Serializable {
|
||||||
IOUtils.toString(
|
IOUtils.toString(
|
||||||
SparkCreateMergeRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
|
SparkCreateMergeRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
|
||||||
parser.parseArgument(new String[]{
|
parser.parseArgument(new String[]{
|
||||||
"-mt", "local[*]",
|
|
||||||
"-i", testGraphBasePath,
|
"-i", testGraphBasePath,
|
||||||
"-asi", testActionSetId,
|
"-asi", testActionSetId,
|
||||||
"-la", "lookupurl",
|
"-la", "lookupurl",
|
||||||
|
@@ -145,7 +143,6 @@ public class SparkDedupTest implements Serializable {
|
||||||
IOUtils.toString(
|
IOUtils.toString(
|
||||||
SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")));
|
SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")));
|
||||||
parser.parseArgument(new String[]{
|
parser.parseArgument(new String[]{
|
||||||
"-mt", "local[*]",
|
|
||||||
"-i", testGraphBasePath,
|
"-i", testGraphBasePath,
|
||||||
"-asi", testActionSetId,
|
"-asi", testActionSetId,
|
||||||
"-la", "lookupurl",
|
"-la", "lookupurl",
|
||||||
|
@@ -170,7 +167,6 @@ public class SparkDedupTest implements Serializable {
|
||||||
IOUtils.toString(
|
IOUtils.toString(
|
||||||
SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")));
|
SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")));
|
||||||
parser.parseArgument(new String[]{
|
parser.parseArgument(new String[]{
|
||||||
"-mt", "local[*]",
|
|
||||||
"-i", testGraphBasePath,
|
"-i", testGraphBasePath,
|
||||||
"-w", testOutputBasePath,
|
"-w", testOutputBasePath,
|
||||||
"-o", testDedupGraphBasePath
|
"-o", testDedupGraphBasePath
|
||||||
|
@@ -221,7 +217,6 @@ public class SparkDedupTest implements Serializable {
|
||||||
IOUtils.toString(
|
IOUtils.toString(
|
||||||
SparkPropagateRelation.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json")));
|
SparkPropagateRelation.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json")));
|
||||||
parser.parseArgument(new String[]{
|
parser.parseArgument(new String[]{
|
||||||
"-mt", "local[*]",
|
|
||||||
"-i", testGraphBasePath,
|
"-i", testGraphBasePath,
|
||||||
"-w", testOutputBasePath,
|
"-w", testOutputBasePath,
|
||||||
"-o", testDedupGraphBasePath
|
"-o", testDedupGraphBasePath
|
||||||
|
|
Loading…
Reference in New Issue