1
0
Fork 0

[dedup] trivial refactoring

This commit is contained in:
Claudio Atzori 2021-11-18 17:12:02 +01:00
parent c0750fb17c
commit a24b9f8268
3 changed files with 28 additions and 42 deletions

View File

@ -14,6 +14,9 @@ import org.xml.sax.SAXException;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -152,4 +155,25 @@ public class DedupUtility {
return o1.compareTo(o2); return o1.compareTo(o2);
} }
/**
 * Builds a similarity {@link Relation} between two identifiers.
 *
 * <p>The relation gets the given source and target, the sub-relation type
 * {@code "dedupSimilarity"}, the relation class {@link ModelConstants#IS_SIMILAR_TO},
 * a freshly created {@link DataInfo}, and a relation type selected by {@code entity}.
 *
 * @param source value stored as the relation source
 * @param target value stored as the relation target
 * @param entity entity type; {@code "result"} and {@code "organization"} are the only
 *               values handled
 * @return the populated relation
 * @throws IllegalArgumentException if {@code entity} is any other non-null value
 */
public static Relation createSimRel(String source, String target, String entity) {
    final Relation simRel = new Relation();
    simRel.setSource(source);
    simRel.setTarget(target);
    simRel.setSubRelType("dedupSimilarity");
    simRel.setRelClass(ModelConstants.IS_SIMILAR_TO);
    simRel.setDataInfo(new DataInfo());

    // Resolve the relation type first, then assign it in a single place.
    final String relType;
    switch (entity) {
        case "result":
            relType = ModelConstants.RESULT_RESULT;
            break;
        case "organization":
            relType = ModelConstants.ORG_ORG_RELTYPE;
            break;
        default:
            throw new IllegalArgumentException("unmanaged entity type: " + entity);
    }
    simRel.setRelType(relType);
    return simRel;
}
} }

View File

@ -20,6 +20,7 @@ import org.xml.sax.SAXException;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.dhp.oa.dedup.model.Block;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
@ -102,7 +103,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
.createDataset( .createDataset(
Deduper Deduper
.computeRelations(sc, blocks, dedupConf) .computeRelations(sc, blocks, dedupConf)
.map(t -> createSimRel(t._1(), t._2(), entity)) .map(t -> DedupUtility.createSimRel(t._1(), t._2(), entity))
.repartition(numPartitions) .repartition(numPartitions)
.rdd(), .rdd(),
Encoders.bean(Relation.class)); Encoders.bean(Relation.class));
@ -111,24 +112,4 @@ public class SparkCreateSimRels extends AbstractSparkAction {
} }
} }
/**
 * Builds a similarity {@link Relation} between two identifiers.
 *
 * <p>The relation gets the given source and target, the sub-relation type
 * {@code "dedupSimilarity"}, the relation class {@code "isSimilarTo"}, a freshly
 * created {@link DataInfo}, and a relation type selected by {@code entity}.
 *
 * @param source value stored as the relation source
 * @param target value stored as the relation target
 * @param entity entity type; {@code "result"} and {@code "organization"} are the only
 *               values handled
 * @return the populated relation
 * @throws IllegalArgumentException if {@code entity} is any other non-null value
 */
private Relation createSimRel(String source, String target, String entity) {
    final Relation simRel = new Relation();
    simRel.setSource(source);
    simRel.setTarget(target);
    simRel.setSubRelType("dedupSimilarity");
    simRel.setRelClass("isSimilarTo");
    simRel.setDataInfo(new DataInfo());

    // Resolve the relation type first, then assign it in a single place.
    final String relType;
    switch (entity) {
        case "result":
            relType = "resultResult";
            break;
        case "organization":
            relType = "organizationOrganization";
            break;
        default:
            throw new IllegalArgumentException("unmanaged entity type: " + entity);
    }
    simRel.setRelType(relType);
    return simRel;
}
} }

View File

@ -124,31 +124,12 @@ public class SparkWhitelistSimRels extends AbstractSparkAction {
Dataset<Relation> whiteListSimRels = whiteListRels2 Dataset<Relation> whiteListSimRels = whiteListRels2
.map( .map(
(MapFunction<Tuple2<String, String>, Relation>) r -> createSimRel(r._1(), r._2(), entity), (MapFunction<Tuple2<String, String>, Relation>) r -> DedupUtility
.createSimRel(r._1(), r._2(), entity),
Encoders.bean(Relation.class)); Encoders.bean(Relation.class));
saveParquet(whiteListSimRels, outputPath, SaveMode.Append); saveParquet(whiteListSimRels, outputPath, SaveMode.Append);
} }
} }
/**
 * Creates the similarity {@link Relation} linking {@code source} to {@code target}.
 *
 * <p>Sub-relation type is {@code "dedupSimilarity"}, relation class is
 * {@code "isSimilarTo"}, and the data info is a new, empty {@link DataInfo}.
 * The relation type depends on {@code entity}: {@code "result"} maps to
 * {@code "resultResult"} and {@code "organization"} maps to
 * {@code "organizationOrganization"}.
 *
 * @param source value stored as the relation source
 * @param target value stored as the relation target
 * @param entity entity type used to pick the relation type
 * @return the populated relation
 * @throws IllegalArgumentException if {@code entity} is a non-null value other than
 *                                  the two handled ones
 */
private Relation createSimRel(String source, String target, String entity) {
    final Relation rel = new Relation();
    rel.setSource(source);
    rel.setTarget(target);
    rel.setSubRelType("dedupSimilarity");
    rel.setRelClass("isSimilarTo");
    rel.setDataInfo(new DataInfo());

    // Pick the relation type for the entity, rejecting anything unrecognised.
    final String typeForEntity;
    switch (entity) {
        case "result":
            typeForEntity = "resultResult";
            break;
        case "organization":
            typeForEntity = "organizationOrganization";
            break;
        default:
            throw new IllegalArgumentException("unmanaged entity type: " + entity);
    }
    rel.setRelType(typeForEntity);
    return rel;
}
} }