forked from D-Net/dnet-hadoop

code formatting

This commit is contained in:
parent bd187ec6e7
commit 6fd25cf549
RunSQLSparkJob.java (eu.dnetlib.dhp.oozie) — the import block is regrouped; no functional change.

@@ -1,13 +1,7 @@
 package eu.dnetlib.dhp.oozie;

-import com.google.common.io.Resources;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import org.apache.commons.lang3.time.DurationFormatUtils;
-import org.apache.commons.text.StringSubstitutor;
-import org.apache.spark.SparkConf;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
-
 import java.net.URL;
 import java.nio.charset.StandardCharsets;

@@ -15,7 +9,15 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Optional;

+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+
+import org.apache.commons.lang3.time.DurationFormatUtils;
+import org.apache.commons.text.StringSubstitutor;
+import org.apache.spark.SparkConf;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.io.Resources;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
 public class RunSQLSparkJob {
     private static final Logger log = LoggerFactory.getLogger(RunSQLSparkJob.class);
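For orientation, the only functional piece this file pulls in through a static import is SparkSessionSupport.runWithSparkHiveSession. Below is a minimal sketch of that call pattern; the Consumer-style signature and the managed-session flag are recalled from the dnet-hadoop helper and should be read as assumptions here, as are the class name, the metastore URI and the stand-in SQL statement.

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Optional;

import org.apache.spark.SparkConf;

public class RunWithHiveSessionSketch {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", "thrift://example:9083"); // placeholder metastore URI

        Boolean isSparkSessionManaged = Optional
            .ofNullable(System.getProperty("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);

        // the helper opens (or reuses) a Hive-enabled SparkSession, hands it to the lambda,
        // and takes care of closing it when the session is managed by the helper itself
        runWithSparkHiveSession(conf, isSparkSessionManaged, spark -> {
            spark.sql("SHOW DATABASES").show(); // stand-in for the templated SQL the real job runs
        });
    }
}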
DedupRecordFactory.java (eu.dnetlib.dhp.oa.dedup)

@@ -1,6 +1,16 @@
@@ -8,180 +18,176 @@ import eu.dnetlib.dhp.schema.oaf.Author;

Both hunks are formatting only: the imports are regrouped (java, org, com, eu.dnetlib) and long statements are wrapped. The class as it reads after this commit:

package eu.dnetlib.dhp.oa.dedup;

import java.util.*;
import java.util.stream.Stream;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*;

import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;
import scala.Tuple3;
import scala.collection.JavaConversions;

public class DedupRecordFactory {

    public static final class DedupRecordReduceState {
        public final String dedupId;

        public final ArrayList<String> aliases = new ArrayList<>();

        public final HashSet<String> acceptanceDate = new HashSet<>();

        public OafEntity entity;

        public DedupRecordReduceState(String dedupId, String id, OafEntity entity) {
            this.dedupId = dedupId;
            this.entity = entity;
            if (entity == null) {
                aliases.add(id);
            } else {
                if (Result.class.isAssignableFrom(entity.getClass())) {
                    Result result = (Result) entity;
                    if (result.getDateofacceptance() != null
                        && StringUtils.isNotBlank(result.getDateofacceptance().getValue())) {
                        acceptanceDate.add(result.getDateofacceptance().getValue());
                    }
                }
            }
        }

        public String getDedupId() {
            return dedupId;
        }
    }

    private static final int MAX_ACCEPTANCE_DATE = 20;

    private DedupRecordFactory() {
    }

    public static Dataset<OafEntity> createDedupRecord(
        final SparkSession spark,
        final DataInfo dataInfo,
        final String mergeRelsInputPath,
        final String entitiesInputPath,
        final Class<OafEntity> clazz) {

        final long ts = System.currentTimeMillis();
        final Encoder<OafEntity> beanEncoder = Encoders.bean(clazz);
        final Encoder<OafEntity> kryoEncoder = Encoders.kryo(clazz);

        // <id, json_entity>
        Dataset<Row> entities = spark
            .read()
            .schema(Encoders.bean(clazz).schema())
            .json(entitiesInputPath)
            .as(beanEncoder)
            .map(
                (MapFunction<OafEntity, Tuple2<String, OafEntity>>) entity -> {
                    return new Tuple2<>(entity.getId(), entity);
                },
                Encoders.tuple(Encoders.STRING(), kryoEncoder))
            .selectExpr("_1 AS id", "_2 AS kryoObject");

        // <source, target>: source is the dedup_id, target is the id of the mergedIn
        Dataset<Row> mergeRels = spark
            .read()
            .load(mergeRelsInputPath)
            .where("relClass == 'merges'")
            .selectExpr("source as dedupId", "target as id");

        return mergeRels
            .join(entities, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "left")
            .select("dedupId", "id", "kryoObject")
            .as(Encoders.tuple(Encoders.STRING(), Encoders.STRING(), kryoEncoder))
            .map(
                (MapFunction<Tuple3<String, String, OafEntity>, DedupRecordReduceState>) t -> new DedupRecordReduceState(
                    t._1(), t._2(), t._3()),
                Encoders.kryo(DedupRecordReduceState.class))
            .groupByKey(
                (MapFunction<DedupRecordReduceState, String>) DedupRecordReduceState::getDedupId, Encoders.STRING())
            .reduceGroups(
                (ReduceFunction<DedupRecordReduceState>) (t1, t2) -> {
                    if (t1.entity == null) {
                        t2.aliases.addAll(t1.aliases);
                        return t2;
                    }
                    if (t1.acceptanceDate.size() < MAX_ACCEPTANCE_DATE) {
                        t1.acceptanceDate.addAll(t2.acceptanceDate);
                    }
                    t1.aliases.addAll(t2.aliases);
                    t1.entity = reduceEntity(t1.entity, t2.entity);

                    return t1;
                })
            .flatMap((FlatMapFunction<Tuple2<String, DedupRecordReduceState>, OafEntity>) t -> {
                String dedupId = t._1();
                DedupRecordReduceState agg = t._2();

                if (agg.acceptanceDate.size() >= MAX_ACCEPTANCE_DATE) {
                    return Collections.emptyIterator();
                }

                return Stream
                    .concat(Stream.of(agg.getDedupId()), agg.aliases.stream())
                    .map(id -> {
                        try {
                            OafEntity res = (OafEntity) BeanUtils.cloneBean(agg.entity);
                            res.setId(id);
                            res.setDataInfo(dataInfo);
                            res.setLastupdatetimestamp(ts);
                            return res;
                        } catch (Exception e) {
                            throw new RuntimeException(e);
                        }
                    })
                    .iterator();
            }, beanEncoder);
    }

    private static OafEntity reduceEntity(OafEntity entity, OafEntity duplicate) {

        if (duplicate == null) {
            return entity;
        }

        int compare = new IdentifierComparator<>()
            .compare(Identifier.newInstance(entity), Identifier.newInstance(duplicate));

        if (compare > 0) {
            OafEntity swap = duplicate;
            duplicate = entity;
            entity = swap;
        }

        entity.mergeFrom(duplicate);

        if (ModelSupport.isSubClass(duplicate, Result.class)) {
            Result re = (Result) entity;
            Result rd = (Result) duplicate;

            List<List<Author>> authors = new ArrayList<>();
            if (re.getAuthor() != null) {
                authors.add(re.getAuthor());
            }
            if (rd.getAuthor() != null) {
                authors.add(rd.getAuthor());
            }

            re.setAuthor(AuthorMerger.merge(authors));
        }

        return entity;
    }

    public static <T extends OafEntity> T entityMerger(
        String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz) {
        T base = entities.next()._2();

        while (entities.hasNext()) {
            T duplicate = entities.next()._2();
            if (duplicate != null)
                base = (T) reduceEntity(base, duplicate);
        }

        base.setId(id);
        base.setDataInfo(dataInfo);
        base.setLastupdatetimestamp(ts);

        return base;
    }

}
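To show how the factory above is meant to be consumed, here is a minimal, hypothetical driver. The application name, the HDFS paths and the choice of Publication as entity type are illustrative assumptions; only the createDedupRecord signature comes from the code above.

import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.oa.dedup.DedupRecordFactory;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Publication;

public class CreateDedupRecordSketch {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
            .setAppName("CreateDedupRecordSketch")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

        SparkSession spark = SparkSession.builder().config(conf).getOrCreate();

        DataInfo dataInfo = new DataInfo(); // provenance attached to every dedup record

        // hypothetical paths; a real job would read them from its parsed arguments
        String mergeRelsPath = "/working_dir/publication_mergerel";
        String entitiesPath = "/graph/publication";

        @SuppressWarnings("unchecked")
        Class<OafEntity> clazz = (Class<OafEntity>) (Class<?>) Publication.class;

        Dataset<OafEntity> dedupRecords = DedupRecordFactory
            .createDedupRecord(spark, dataInfo, mergeRelsPath, entitiesPath, clazz);

        dedupRecords
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json("/working_dir/publication_deduprecord");

        spark.stop();
    }
}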
SparkCreateMergeRels.java — two conditions are re-wrapped and re-spaced; no functional change.

@@ -242,13 +242,14 @@ public class SparkCreateMergeRels extends AbstractSparkAction {

     // this was a pivot in a previous graph but it has been merged into a new group with different
     // pivot
-    if (!r.isNullAt(r.fieldIndex("lastUsage")) && !pivot.equals(id) && !dedupId.equals(pivotDedupId)) {
+    if (!r.isNullAt(r.fieldIndex("lastUsage")) && !pivot.equals(id)
+        && !dedupId.equals(pivotDedupId)) {
         // materialize the previous dedup record as a merge relation with the new one
         res.add(new Tuple3<>(dedupId, pivotDedupId, null));
     }

     // add merge relations
-    if (cut <=0 || r.<Integer>getAs("position") <= cut) {
+    if (cut <= 0 || r.<Integer> getAs("position") <= cut) {
         res.add(new Tuple3<>(id, pivotDedupId, pivot));
     }
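The two touched conditions decide, for each row of the pivot history, whether the previous dedup record is bridged to the new one and whether the row still falls inside the optional group-size cut. The standalone paraphrase below restates that decision with the row fields as plain parameters; the class and parameter names are invented for illustration, while the guard expressions and the Tuple3 payloads mirror the hunk above.

import java.util.ArrayList;
import java.util.List;

import scala.Tuple3;

public class MergeRelDecisionSketch {

    // Re-states the two guarded additions from the hunk: one relation bridging the previous
    // dedup record to the new pivot, and one ordinary merge relation kept only while the
    // group is within the configured cut (cut <= 0 disables the limit).
    public static List<Tuple3<String, String, String>> relationsFor(
        String id, String dedupId, String pivot, String pivotDedupId,
        boolean hasLastUsage, int position, int cut) {

        List<Tuple3<String, String, String>> res = new ArrayList<>();

        // pivot of a previous graph, now merged into a group with a different pivot
        if (hasLastUsage && !pivot.equals(id) && !dedupId.equals(pivotDedupId)) {
            res.add(new Tuple3<>(dedupId, pivotDedupId, null));
        }

        // ordinary merge relation, subject to the optional group-size cut
        if (cut <= 0 || position <= cut) {
            res.add(new Tuple3<>(id, pivotDedupId, pivot));
        }

        return res;
    }

    public static void main(String[] args) {
        // with cut = 2, a row at position 3 contributes no merge relation
        System.out.println(relationsFor("id3", "dedup_old", "pivot", "dedup_new", false, 3, 2)); // []
        // with the cut disabled (0), the same row is kept
        System.out.println(relationsFor("id3", "dedup_old", "pivot", "dedup_new", false, 3, 0).size()); // 1
    }
}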