1
0
Fork 0

code formatting

This commit is contained in:
Claudio Atzori 2024-01-23 08:47:12 +01:00
parent bd187ec6e7
commit 6fd25cf549
3 changed files with 158 additions and 149 deletions

View File

@ -1,13 +1,7 @@
package eu.dnetlib.dhp.oozie;
import com.google.common.io.Resources;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.commons.text.StringSubstitutor;
import org.apache.spark.SparkConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.net.URL;
import java.nio.charset.StandardCharsets;
@ -15,7 +9,15 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.commons.text.StringSubstitutor;
import org.apache.spark.SparkConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.io.Resources;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class RunSQLSparkJob {
private static final Logger log = LoggerFactory.getLogger(RunSQLSparkJob.class);

View File

@ -1,6 +1,16 @@
package eu.dnetlib.dhp.oa.dedup;
import java.util.*;
import java.util.stream.Stream;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*;
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -8,19 +18,10 @@ import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*;
import scala.Tuple2;
import scala.Tuple3;
import scala.collection.JavaConversions;
import java.util.*;
import java.util.stream.Stream;
public class DedupRecordFactory {
public static final class DedupRecordReduceState {
public final String dedupId;
@ -39,7 +40,8 @@ public class DedupRecordFactory {
} else {
if (Result.class.isAssignableFrom(entity.getClass())) {
Result result = (Result) entity;
if (result.getDateofacceptance() != null && StringUtils.isNotBlank(result.getDateofacceptance().getValue())) {
if (result.getDateofacceptance() != null
&& StringUtils.isNotBlank(result.getDateofacceptance().getValue())) {
acceptanceDate.add(result.getDateofacceptance().getValue());
}
}
@ -50,6 +52,7 @@ public class DedupRecordFactory {
return dedupId;
}
}
private static final int MAX_ACCEPTANCE_DATE = 20;
private DedupRecordFactory() {
@ -90,8 +93,12 @@ public class DedupRecordFactory {
.join(entities, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "left")
.select("dedupId", "id", "kryoObject")
.as(Encoders.tuple(Encoders.STRING(), Encoders.STRING(), kryoEncoder))
.map((MapFunction<Tuple3<String, String, OafEntity>, DedupRecordReduceState>) t -> new DedupRecordReduceState(t._1(), t._2(), t._3()), Encoders.kryo(DedupRecordReduceState.class))
.groupByKey((MapFunction<DedupRecordReduceState, String>) DedupRecordReduceState::getDedupId, Encoders.STRING())
.map(
(MapFunction<Tuple3<String, String, OafEntity>, DedupRecordReduceState>) t -> new DedupRecordReduceState(
t._1(), t._2(), t._3()),
Encoders.kryo(DedupRecordReduceState.class))
.groupByKey(
(MapFunction<DedupRecordReduceState, String>) DedupRecordReduceState::getDedupId, Encoders.STRING())
.reduceGroups(
(ReduceFunction<DedupRecordReduceState>) (t1, t2) -> {
if (t1.entity == null) {
@ -105,10 +112,8 @@ public class DedupRecordFactory {
t1.entity = reduceEntity(t1.entity, t2.entity);
return t1;
}
)
.flatMap
((FlatMapFunction<Tuple2<String, DedupRecordReduceState>, OafEntity>) t -> {
})
.flatMap((FlatMapFunction<Tuple2<String, DedupRecordReduceState>, OafEntity>) t -> {
String dedupId = t._1();
DedupRecordReduceState agg = t._2();
@ -116,7 +121,8 @@ public class DedupRecordFactory {
return Collections.emptyIterator();
}
return Stream.concat(Stream.of(agg.getDedupId()), agg.aliases.stream())
return Stream
.concat(Stream.of(agg.getDedupId()), agg.aliases.stream())
.map(id -> {
try {
OafEntity res = (OafEntity) BeanUtils.cloneBean(agg.entity);
@ -127,7 +133,8 @@ public class DedupRecordFactory {
} catch (Exception e) {
throw new RuntimeException(e);
}
}).iterator();
})
.iterator();
}, beanEncoder);
}
@ -137,7 +144,6 @@ public class DedupRecordFactory {
return entity;
}
int compare = new IdentifierComparator<>()
.compare(Identifier.newInstance(entity), Identifier.newInstance(duplicate));

View File

@ -242,13 +242,14 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
// this was a pivot in a previous graph but it has been merged into a new group with different
// pivot
if (!r.isNullAt(r.fieldIndex("lastUsage")) && !pivot.equals(id) && !dedupId.equals(pivotDedupId)) {
if (!r.isNullAt(r.fieldIndex("lastUsage")) && !pivot.equals(id)
&& !dedupId.equals(pivotDedupId)) {
// materialize the previous dedup record as a merge relation with the new one
res.add(new Tuple3<>(dedupId, pivotDedupId, null));
}
// add merge relations
if (cut <=0 || r.<Integer>getAs("position") <= cut) {
if (cut <= 0 || r.<Integer> getAs("position") <= cut) {
res.add(new Tuple3<>(id, pivotDedupId, pivot));
}