Modification of the jobs for the integration of OpenOrgs in the provision: dedup records are no longer created by merging, but taken directly from the results of the OpenOrgs portal.

pull/104/head
miconis 3 years ago
parent 1e7e5180fa
commit c39c82dfe9
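In short: this job no longer builds the organization merge relations out of OpenOrgs isSimilarTo relations around a synthetic dedup id; the merges/isMergedIn relations maintained by the OpenOrgs portal are copied as they are, and the dedup record of each group is the root organization itself. A minimal sketch of the selection step, under the assumption of the usual ModelConstants package path (class and method names below are illustrative; the real job parses relations with its patchRelFn() and also filters on OpenOrgs provenance via isOpenorgs()):

import com.fasterxml.jackson.databind.ObjectMapper;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.schema.common.ModelConstants; // assumed package, the constants appear in the diff below
import eu.dnetlib.dhp.schema.oaf.Relation;

public class OpenorgsMergeRelsSketch {

	private static final ObjectMapper MAPPER = new ObjectMapper();

	// read the raw relations of the graph and keep only the merge tree provided by OpenOrgs:
	// merges / isMergedIn relations of type organizationOrganization and subRelType dedup
	public static JavaRDD<Relation> openorgsMergeRels(SparkSession spark, String relationPath) {
		JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
		return sc
			.textFile(relationPath)
			.map(json -> MAPPER.readValue(json, Relation.class))
			.filter(r -> (ModelConstants.MERGES.equals(r.getRelClass())
				|| ModelConstants.IS_MERGED_IN.equals(r.getRelClass()))
				&& ModelConstants.ORG_ORG_RELTYPE.equals(r.getRelType())
				&& ModelConstants.DEDUP.equals(r.getSubRelType()));
	}
}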

@@ -78,42 +78,16 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation");
DedupConfig dedupConf = getConfigurations(isLookUpService, actionSetId).get(0);
JavaRDD<Relation> rawRels = spark
//collect organization merge relations from openorgs database
JavaRDD<Relation> mergeRelsRDD = spark
.read()
.textFile(relationPath)
.map(patchRelFn(), Encoders.bean(Relation.class))
.toJavaRDD()
.filter(this::isOpenorgs)
.filter(this::filterOpenorgsRels);
JavaRDD<Relation> selfRawRels = rawRels
.map(r -> r.getSource())
.distinct()
.map(s -> rel(s, s, ModelConstants.IS_SIMILAR_TO, dedupConf));
log.info("Number of raw Openorgs Relations collected: {}", rawRels.count());
// turn openorgs isSimilarTo relations into mergerels
JavaRDD<Relation> mergeRelsRDD = rawRels
.union(selfRawRels)
.map(r -> {
r.setSource(createDedupID(r.getSource())); // create the dedup_id to align it to the openaire dedup
// format
return r;
})
.flatMap(rel -> {
List<Relation> mergerels = new ArrayList<>();
mergerels.add(rel(rel.getSource(), rel.getTarget(), ModelConstants.MERGES, dedupConf));
mergerels.add(rel(rel.getTarget(), rel.getSource(), ModelConstants.IS_MERGED_IN, dedupConf));
.filter(this::isOpenorgs) //take only openorgs relations
.filter(this::isMergeRel); //take merges and isMergedIn relations
return mergerels.iterator();
});
log.info("Number of Openorgs Merge Relations created: {}", mergeRelsRDD.count());
log.info("Number of Openorgs Merge Relations collected: {}", mergeRelsRDD.count());
spark
.createDataset(
@@ -124,45 +98,9 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
.parquet(outputPath);
}
private boolean filterOpenorgsRels(Relation rel) {
return rel.getRelClass().equals(ModelConstants.IS_SIMILAR_TO)
private boolean isMergeRel(Relation rel) {
return (rel.getRelClass().equals(ModelConstants.MERGES) || rel.getRelClass().equals(ModelConstants.IS_MERGED_IN))
&& rel.getRelType().equals(ModelConstants.ORG_ORG_RELTYPE)
&& rel.getSubRelType().equals(ModelConstants.DEDUP);
}
private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) {
String entityType = dedupConf.getWf().getEntityType();
Relation r = new Relation();
r.setSource(source);
r.setTarget(target);
r.setRelClass(relClass);
r.setRelType(entityType + entityType.substring(0, 1).toUpperCase() + entityType.substring(1));
r.setSubRelType(ModelConstants.DEDUP);
DataInfo info = new DataInfo();
info.setDeletedbyinference(false);
info.setInferred(true);
info.setInvisible(false);
info.setInferenceprovenance(dedupConf.getWf().getConfigurationId());
Qualifier provenanceAction = new Qualifier();
provenanceAction.setClassid(ModelConstants.PROVENANCE_DEDUP);
provenanceAction.setClassname(ModelConstants.PROVENANCE_DEDUP);
provenanceAction.setSchemeid(ModelConstants.DNET_PROVENANCE_ACTIONS);
provenanceAction.setSchemename(ModelConstants.DNET_PROVENANCE_ACTIONS);
info.setProvenanceaction(provenanceAction);
// TODO calculate the trust value based on the similarity score of the elements in the CC
// info.setTrust();
r.setDataInfo(info);
return r;
}
public String createDedupID(String id) {
String prefix = id.split("\\|")[0];
return prefix + "|dedup_wf_001::" + DHPUtils.md5(id);
}
}
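For context, createDedupID above (no longer needed by the new flow) is what used to rewrite an OpenOrgs identifier into the OpenAIRE dedup id space; a minimal usage sketch, with a made-up input id:

	// hypothetical OpenOrgs organization id (the hash part is invented for illustration)
	String openorgsId = "20|openorgs____::b84450f9864182c67b8611b5593f4250";
	String prefix = openorgsId.split("\\|")[0];                              // "20"
	String dedupId = prefix + "|dedup_wf_001::" + DHPUtils.md5(openorgsId);  // "20|dedup_wf_001::<md5 of the full id>"
	// with this change the synthetic id is dropped: the OpenOrgs root record keeps its own id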

@@ -4,8 +4,10 @@ package eu.dnetlib.dhp.oa.dedup;
import java.io.IOException;
import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@@ -24,11 +26,12 @@ import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class SparkCopyOpenorgs extends AbstractSparkAction {
private static final Logger log = LoggerFactory.getLogger(SparkCopyOpenorgs.class);
public class SparkCreateOrgsDedupRecord extends AbstractSparkAction {
private static final Logger log = LoggerFactory.getLogger(SparkCreateOrgsDedupRecord.class);
public SparkCopyOpenorgs(ArgumentApplicationParser parser, SparkSession spark) {
public SparkCreateOrgsDedupRecord(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
@@ -36,13 +39,13 @@ public class SparkCopyOpenorgs extends AbstractSparkAction {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCopyOpenorgs.class
SparkCreateOrgsDedupRecord.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/copyOpenorgs_parameters.json")));
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCopyOpenorgs(parser, getSparkSession(conf))
new SparkCreateOrgsDedupRecord(parser, getSparkSession(conf))
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
}
@@ -64,14 +67,15 @@ public class SparkCopyOpenorgs extends AbstractSparkAction {
log.info("actionSetId: '{}'", actionSetId);
log.info("workingPath: '{}'", workingPath);
String subEntity = "organization";
log.info("Copying openorgs to the working dir");
log.info("Copying organization dedup records to the working dir");
final String outputPath = DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity);
final String outputPath = DedupUtility.createDedupRecordPath(workingPath, actionSetId, "organization");
final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity);
final String entityPath = DedupUtility.createEntityPath(graphBasePath, "organization");
filterOpenorgs(spark, entityPath)
final String mergeRelsPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, "organization");
rootOrganization(spark, entityPath, mergeRelsPath)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
@@ -79,26 +83,43 @@ public class SparkCopyOpenorgs extends AbstractSparkAction {
}
public static Dataset<Organization> filterOpenorgs(
public static Dataset<Organization> rootOrganization(
final SparkSession spark,
final String entitiesInputPath) {
final String entitiesInputPath,
final String mergeRelsPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
Dataset<Organization> entities = spark
.createDataset(
sc
.textFile(entitiesInputPath)
.map(it -> OBJECT_MAPPER.readValue(it, Organization.class))
.rdd(),
Encoders.bean(Organization.class));
JavaPairRDD<String, Organization> entities = sc.textFile(entitiesInputPath)
.map(it -> OBJECT_MAPPER.readValue(it, Organization.class))
.mapToPair(o -> new Tuple2<>(o.getId(), o));
log.info("Number of organization entities processed: {}", entities.count());
entities = entities.filter(entities.col("id").contains(DedupUtility.OPENORGS_ID_PREFIX));
//collect root ids (the ids appearing as source of 'merges' relations)
JavaPairRDD<String, String> roots = spark
.read()
.load(mergeRelsPath)
.as(Encoders.bean(Relation.class))
.where("relClass == 'merges'")
.map(
(MapFunction<Relation, Tuple2<String, String>>) r -> new Tuple2<>(r.getSource(), "root"),
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.toJavaRDD()
.mapToPair(t -> t)
.distinct();
Dataset<Organization> rootOrgs = spark.createDataset(
entities
.leftOuterJoin(roots)
.filter(e -> e._2()._2().isPresent()) //if it has been joined with 'root' then it's a root record
.map(e -> e._2()._1())
.rdd(),
Encoders.bean(Organization.class));
log.info("Number of Openorgs organization entities: {}", entities.count());
log.info("Number of Root organization: {}", entities.count());
return entities;
return rootOrgs;
}
}
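The patch selects the root records with a JavaPairRDD leftOuterJoin against the sources of the 'merges' relations; the same idea restated with the Dataset API, as a sketch under the assumption that the organizations Dataset has already been loaded (the job itself parses the json lines with OBJECT_MAPPER):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;

// organizations whose id appears as the source of a 'merges' relation are the roots of the
// OpenOrgs merge trees: those records become, unchanged, the organization dedup records
public static Dataset<Organization> rootOrganizationsSketch(
	SparkSession spark, Dataset<Organization> organizations, String mergeRelsPath) {

	Dataset<Row> rootIds = spark
		.read()
		.load(mergeRelsPath)
		.as(Encoders.bean(Relation.class))
		.where("relClass = 'merges'")
		.select("source")
		.distinct();

	// left_semi join: keep only the organizations whose id is a merge-tree root
	return organizations
		.join(rootIds, organizations.col("id").equalTo(rootIds.col("source")), "left_semi")
		.as(Encoders.bean(Organization.class));
}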

@@ -101,6 +101,9 @@ public class SparkUpdateEntity extends AbstractSparkAction {
.mapToPair(
(PairFunction<String, String, String>) s -> new Tuple2<>(
MapDocumentUtil.getJPathString(IDJSONPATH, s), s));
if (type == EntityType.organization) //exclude root records from organizations
entitiesWithId = excludeRootOrgs(entitiesWithId, rel);
JavaRDD<String> map = entitiesWithId
.leftOuterJoin(mergedIds)
.map(k -> {
@@ -110,13 +113,6 @@ public class SparkUpdateEntity extends AbstractSparkAction {
return k._2()._1();
});
if (type == EntityType.organization) // exclude openorgs with deletedbyinference=true
map = map.filter(it -> {
Organization org = OBJECT_MAPPER.readValue(it, Organization.class);
return !org.getId().contains("openorgs____") || (org.getId().contains("openorgs____")
&& !org.getDataInfo().getDeletedbyinference());
});
sourceEntity = map.union(sc.textFile(dedupRecordPath));
}
@@ -159,4 +155,20 @@ public class SparkUpdateEntity extends AbstractSparkAction {
throw new RuntimeException("Unable to convert json", e);
}
}
private static JavaPairRDD<String, String> excludeRootOrgs(JavaPairRDD<String, String> entitiesWithId, Dataset<Relation> rel) {
JavaPairRDD<String, String> roots = rel
.where("relClass == 'merges'")
.select(rel.col("source"))
.distinct()
.toJavaRDD()
.mapToPair(
(PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "root"));
return entitiesWithId
.leftOuterJoin(roots)
.filter(e -> !e._2()._2().isPresent())
.mapToPair(e -> new Tuple2<>(e._1(), e._2()._1()));
}
}

@@ -187,7 +187,7 @@
<error to="Kill"/>
</action>
<!-- copy organization relations in the working dir (in the organization_mergerel dir)-->
<!-- copy organization merge relations in the working dir (in the organization_mergerel dir)-->
<action name="CopyOpenorgsMergeRels">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@@ -220,7 +220,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Create Organizations Dedup Records</name>
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -241,33 +241,6 @@
<error to="Kill"/>
</action>
<!--TODO replace with job for the creation of deduprecord for openorgs organizations -->
<!-- copy openorgs to the working dir (in the organization_deduprecord dir)-->
<!--<action name="CopyOpenorgs">-->
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
<!--<master>yarn</master>-->
<!--<mode>cluster</mode>-->
<!--<name>Copy Openorgs Entities</name>-->
<!--<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgs</class>-->
<!--<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>-->
<!--<spark-opts>-->
<!--&#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!--&#45;&#45;executor-cores=${sparkExecutorCores}-->
<!--&#45;&#45;driver-memory=${sparkDriverMemory}-->
<!--&#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!--&#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!--&#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!--&#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!--&#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!--</spark-opts>-->
<!--<arg>&#45;&#45;graphBasePath</arg><arg>${graphBasePath}</arg>-->
<!--<arg>&#45;&#45;workingPath</arg><arg>${workingPath}</arg>-->
<!--<arg>&#45;&#45;actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>-->
<!--</spark>-->
<!--<ok to="UpdateEntity"/>-->
<!--<error to="Kill"/>-->
<!--</action>-->
<action name="UpdateEntity">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>

@@ -12,10 +12,14 @@ import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
@@ -148,9 +152,14 @@ public class SparkOpenorgsTest implements Serializable {
long orgs_mergerel = spark
.read()
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
.load(DedupUtility.createMergeRelPath(testOutputBasePath, testActionSetId, "organization"))
.count();
Dataset<Relation> orgrels = spark.read().load(DedupUtility.createMergeRelPath(testOutputBasePath, testActionSetId, "organization")).as(Encoders.bean(Relation.class));
for (Relation r: orgrels.toJavaRDD().collect())
System.out.println("r = " + r.getSource() + "---" + r.getTarget() + "---" + r.getRelClass());
assertEquals(384, orgs_mergerel);
}

@@ -180,14 +180,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
log.info("Processing Openorgs Merge Rels...");
smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgSimRels);
//TODO change the relation mapping so that it creates merges and isMergedIn relations
// TODO (specific to this case: this mapping function will be used as-is for the openorgs dedup case)
break;
case openaire_organizations:
log.info("Processing Organizations...");
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization, verifyNamespacePrefix);
break;
}
log.info("All done.");

@@ -50,4 +50,7 @@ GROUP BY
o.trust,
d.id,
d.officialname,
o.country;
o.country;
-- TODO change this so that the pids of all the merged records are merged (for openorgs, only the approved ones)
-- TODO for all the other records with duplicates, instead, do not do this