dnet-hadoop/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java

275 lines
11 KiB
Java

package eu.dnetlib.dhp.dedup;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2;
import java.util.Collection;
public class DedupRecordFactory {
public static JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf) {
long ts = System.currentTimeMillis();
//<id, json_entity>
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
.mapToPair((PairFunction<String, String, String>) it ->
new Tuple2<String, String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)
);
//<source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String, String> mergeRels = spark
.read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.mapToPair(
(PairFunction<Relation, String, String>) r ->
new Tuple2<String, String>(r.getTarget(), r.getSource())
);
//<dedup_id, json_entity_merged>
final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
JavaPairRDD<String, Iterable<String>> sortedJoinResult = joinResult.groupByKey();
switch (entityType) {
case publication:
return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts));
case dataset:
return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts));
case project:
return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts));
case software:
return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts));
case datasource:
return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts));
case organization:
return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts));
case otherresearchproduct:
return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts));
default:
return null;
}
}
private static Publication publicationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Publication p = new Publication(); //the result of the merge, to be returned at the end
p.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e._2().forEach(pub -> {
try {
Publication publication = mapper.readValue(pub, Publication.class);
p.mergeFrom(publication);
p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
//add to the list if they are not null
if (publication.getDateofacceptance() != null)
dateofacceptance.add(publication.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
p.setDateofacceptance(DatePicker.pick(dateofacceptance));
if (p.getDataInfo() == null)
p.setDataInfo(new DataInfo());
p.getDataInfo().setTrust("0.9");
p.setLastupdatetimestamp(ts);
return p;
}
private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Dataset d = new Dataset(); //the result of the merge, to be returned at the end
d.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e._2().forEach(dat -> {
try {
Dataset dataset = mapper.readValue(dat, Dataset.class);
d.mergeFrom(dataset);
d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor()));
//add to the list if they are not null
if (dataset.getDateofacceptance() != null)
dateofacceptance.add(dataset.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
d.setDateofacceptance(DatePicker.pick(dateofacceptance));
if (d.getDataInfo() == null)
d.setDataInfo(new DataInfo());
d.getDataInfo().setTrust("0.9");
d.setLastupdatetimestamp(ts);
return d;
}
private static Project projectMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Project p = new Project(); //the result of the merge, to be returned at the end
p.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
if (e._2() != null)
e._2().forEach(proj -> {
try {
Project project = mapper.readValue(proj, Project.class);
p.mergeFrom(project);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (p.getDataInfo() == null)
p.setDataInfo(new DataInfo());
p.getDataInfo().setTrust("0.9");
p.setLastupdatetimestamp(ts);
return p;
}
private static Software softwareMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Software s = new Software(); //the result of the merge, to be returned at the end
s.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e._2().forEach(soft -> {
try {
Software software = mapper.readValue(soft, Software.class);
s.mergeFrom(software);
s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor()));
//add to the list if they are not null
if (software.getDateofacceptance() != null)
dateofacceptance.add(software.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
s.setDateofacceptance(DatePicker.pick(dateofacceptance));
if (s.getDataInfo() == null)
s.setDataInfo(new DataInfo());
s.getDataInfo().setTrust("0.9");
s.setLastupdatetimestamp(ts);
return s;
}
private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Datasource d = new Datasource(); //the result of the merge, to be returned at the end
d.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
if (e._2() != null)
e._2().forEach(dat -> {
try {
Datasource datasource = mapper.readValue(dat, Datasource.class);
d.mergeFrom(datasource);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (d.getDataInfo() == null)
d.setDataInfo(new DataInfo());
d.getDataInfo().setTrust("0.9");
d.setLastupdatetimestamp(ts);
return d;
}
private static Organization organizationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Organization o = new Organization(); //the result of the merge, to be returned at the end
o.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
StringBuilder trust = new StringBuilder("0.0");
if (e._2() != null)
e._2().forEach(pub -> {
try {
Organization organization = mapper.readValue(pub, Organization.class);
final String currentTrust = organization.getDataInfo().getTrust();
if (!"1.0".equals(currentTrust)) {
trust.setLength(0);
trust.append(currentTrust);
}
o.mergeFrom(organization);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (o.getDataInfo() == null)
{
o.setDataInfo(new DataInfo());
}
if (o.getDataInfo() == null)
o.setDataInfo(new DataInfo());
o.getDataInfo().setTrust("0.9");
o.setLastupdatetimestamp(ts);
return o;
}
private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e, final long ts) {
OtherResearchProduct o = new OtherResearchProduct(); //the result of the merge, to be returned at the end
o.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e._2().forEach(orp -> {
try {
OtherResearchProduct otherResearchProduct = mapper.readValue(orp, OtherResearchProduct.class);
o.mergeFrom(otherResearchProduct);
o.setAuthor(DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor()));
//add to the list if they are not null
if (otherResearchProduct.getDateofacceptance() != null)
dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (o.getDataInfo() == null)
o.setDataInfo(new DataInfo());
o.setDateofacceptance(DatePicker.pick(dateofacceptance));
o.getDataInfo().setTrust("0.9");
o.setLastupdatetimestamp(ts);
return o;
}
}