forked from antonis.lempesis/dnet-hadoop
[dedup] trivial refactoring
This commit is contained in:
parent
c0750fb17c
commit
a24b9f8268
|
@ -14,6 +14,9 @@ import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
@ -152,4 +155,25 @@ public class DedupUtility {
|
||||||
return o1.compareTo(o2);
|
return o1.compareTo(o2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds a similarity {@link Relation} between two records.
 *
 * <p>The returned relation has its source and target set to the given values, its
 * subRelType set to {@code "dedupSimilarity"}, its relClass set to
 * {@code ModelConstants.IS_SIMILAR_TO}, and a newly constructed {@link DataInfo}.
 * The relType is chosen from the entity name.
 *
 * @param source value stored as the relation's source
 * @param target value stored as the relation's target
 * @param entity entity type name; {@code "result"} and {@code "organization"} are the
 *               only values handled
 * @return the newly created relation
 * @throws IllegalArgumentException if {@code entity} is any other non-null value
 * @throws NullPointerException if {@code entity} is null (switching on a null String)
 */
public static Relation createSimRel(String source, String target, String entity) {
|
||||||
|
final Relation r = new Relation();
|
||||||
|
r.setSource(source);
|
||||||
|
r.setTarget(target);
|
||||||
|
r.setSubRelType("dedupSimilarity");
|
||||||
|
r.setRelClass(ModelConstants.IS_SIMILAR_TO);
|
||||||
|
r.setDataInfo(new DataInfo());
|
||||||
|
|
||||||
|
// The relType depends on which kind of entity the two identifiers refer to.
switch (entity) {
|
||||||
|
case "result":
|
||||||
|
r.setRelType(ModelConstants.RESULT_RESULT);
|
||||||
|
break;
|
||||||
|
case "organization":
|
||||||
|
r.setRelType(ModelConstants.ORG_ORG_RELTYPE);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// Any other entity name is rejected rather than returning a relation without a relType.
throw new IllegalArgumentException("unmanaged entity type: " + entity);
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@ import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
|
@ -102,7 +103,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
.createDataset(
|
.createDataset(
|
||||||
Deduper
|
Deduper
|
||||||
.computeRelations(sc, blocks, dedupConf)
|
.computeRelations(sc, blocks, dedupConf)
|
||||||
.map(t -> createSimRel(t._1(), t._2(), entity))
|
.map(t -> DedupUtility.createSimRel(t._1(), t._2(), entity))
|
||||||
.repartition(numPartitions)
|
.repartition(numPartitions)
|
||||||
.rdd(),
|
.rdd(),
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
|
@ -111,24 +112,4 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Relation createSimRel(String source, String target, String entity) {
|
|
||||||
final Relation r = new Relation();
|
|
||||||
r.setSource(source);
|
|
||||||
r.setTarget(target);
|
|
||||||
r.setSubRelType("dedupSimilarity");
|
|
||||||
r.setRelClass("isSimilarTo");
|
|
||||||
r.setDataInfo(new DataInfo());
|
|
||||||
|
|
||||||
switch (entity) {
|
|
||||||
case "result":
|
|
||||||
r.setRelType("resultResult");
|
|
||||||
break;
|
|
||||||
case "organization":
|
|
||||||
r.setRelType("organizationOrganization");
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
throw new IllegalArgumentException("unmanaged entity type: " + entity);
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -124,31 +124,12 @@ public class SparkWhitelistSimRels extends AbstractSparkAction {
|
||||||
|
|
||||||
Dataset<Relation> whiteListSimRels = whiteListRels2
|
Dataset<Relation> whiteListSimRels = whiteListRels2
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Tuple2<String, String>, Relation>) r -> createSimRel(r._1(), r._2(), entity),
|
(MapFunction<Tuple2<String, String>, Relation>) r -> DedupUtility
|
||||||
|
.createSimRel(r._1(), r._2(), entity),
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
|
|
||||||
saveParquet(whiteListSimRels, outputPath, SaveMode.Append);
|
saveParquet(whiteListSimRels, outputPath, SaveMode.Append);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Relation createSimRel(String source, String target, String entity) {
|
|
||||||
final Relation r = new Relation();
|
|
||||||
r.setSource(source);
|
|
||||||
r.setTarget(target);
|
|
||||||
r.setSubRelType("dedupSimilarity");
|
|
||||||
r.setRelClass("isSimilarTo");
|
|
||||||
r.setDataInfo(new DataInfo());
|
|
||||||
|
|
||||||
switch (entity) {
|
|
||||||
case "result":
|
|
||||||
r.setRelType("resultResult");
|
|
||||||
break;
|
|
||||||
case "organization":
|
|
||||||
r.setRelType("organizationOrganization");
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
throw new IllegalArgumentException("unmanaged entity type: " + entity);
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue