modification of the jobs for the integration of openorgs in the provision, dedup records are no more created by merging but simply taking results of openorgs portal

This commit is contained in:
miconis 2021-04-06 14:31:00 +02:00
parent 1e7e5180fa
commit c39c82dfe9
7 changed files with 87 additions and 131 deletions

View File

@ -78,42 +78,16 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation");
DedupConfig dedupConf = getConfigurations(isLookUpService, actionSetId).get(0);
JavaRDD<Relation> rawRels = spark
//collect organization merge relations from openorgs database
JavaRDD<Relation> mergeRelsRDD = spark
.read()
.textFile(relationPath)
.map(patchRelFn(), Encoders.bean(Relation.class))
.toJavaRDD()
.filter(this::isOpenorgs)
.filter(this::filterOpenorgsRels);
.filter(this::isOpenorgs) //take only openorgs relations
.filter(this::isMergeRel); //take merges and isMergedIn relations
JavaRDD<Relation> selfRawRels = rawRels
.map(r -> r.getSource())
.distinct()
.map(s -> rel(s, s, ModelConstants.IS_SIMILAR_TO, dedupConf));
log.info("Number of raw Openorgs Relations collected: {}", rawRels.count());
// turn openorgs isSimilarTo relations into mergerels
JavaRDD<Relation> mergeRelsRDD = rawRels
.union(selfRawRels)
.map(r -> {
r.setSource(createDedupID(r.getSource())); // create the dedup_id to align it to the openaire dedup
// format
return r;
})
.flatMap(rel -> {
List<Relation> mergerels = new ArrayList<>();
mergerels.add(rel(rel.getSource(), rel.getTarget(), ModelConstants.MERGES, dedupConf));
mergerels.add(rel(rel.getTarget(), rel.getSource(), ModelConstants.IS_MERGED_IN, dedupConf));
return mergerels.iterator();
});
log.info("Number of Openorgs Merge Relations created: {}", mergeRelsRDD.count());
log.info("Number of Openorgs Merge Relations collected: {}", mergeRelsRDD.count());
spark
.createDataset(
@ -124,45 +98,9 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
.parquet(outputPath);
}
private boolean filterOpenorgsRels(Relation rel) {
return rel.getRelClass().equals(ModelConstants.IS_SIMILAR_TO)
private boolean isMergeRel(Relation rel) {
return (rel.getRelClass().equals(ModelConstants.MERGES) || rel.getRelClass().equals(ModelConstants.IS_MERGED_IN))
&& rel.getRelType().equals(ModelConstants.ORG_ORG_RELTYPE)
&& rel.getSubRelType().equals(ModelConstants.DEDUP);
}
private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) {
String entityType = dedupConf.getWf().getEntityType();
Relation r = new Relation();
r.setSource(source);
r.setTarget(target);
r.setRelClass(relClass);
r.setRelType(entityType + entityType.substring(0, 1).toUpperCase() + entityType.substring(1));
r.setSubRelType(ModelConstants.DEDUP);
DataInfo info = new DataInfo();
info.setDeletedbyinference(false);
info.setInferred(true);
info.setInvisible(false);
info.setInferenceprovenance(dedupConf.getWf().getConfigurationId());
Qualifier provenanceAction = new Qualifier();
provenanceAction.setClassid(ModelConstants.PROVENANCE_DEDUP);
provenanceAction.setClassname(ModelConstants.PROVENANCE_DEDUP);
provenanceAction.setSchemeid(ModelConstants.DNET_PROVENANCE_ACTIONS);
provenanceAction.setSchemename(ModelConstants.DNET_PROVENANCE_ACTIONS);
info.setProvenanceaction(provenanceAction);
// TODO calculate the trust value based on the similarity score of the elements in the CC
// info.setTrust();
r.setDataInfo(info);
return r;
}
public String createDedupID(String id) {
String prefix = id.split("\\|")[0];
return prefix + "|dedup_wf_001::" + DHPUtils.md5(id);
}
}

View File

@ -4,8 +4,10 @@ package eu.dnetlib.dhp.oa.dedup;
import java.io.IOException;
import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@ -24,11 +26,12 @@ import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class SparkCopyOpenorgs extends AbstractSparkAction {
private static final Logger log = LoggerFactory.getLogger(SparkCopyOpenorgs.class);
public class SparkCreateOrgsDedupRecord extends AbstractSparkAction {
private static final Logger log = LoggerFactory.getLogger(SparkCreateOrgsDedupRecord.class);
public SparkCopyOpenorgs(ArgumentApplicationParser parser, SparkSession spark) {
public SparkCreateOrgsDedupRecord(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
@ -36,13 +39,13 @@ public class SparkCopyOpenorgs extends AbstractSparkAction {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCopyOpenorgs.class
SparkCreateOrgsDedupRecord.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/copyOpenorgs_parameters.json")));
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCopyOpenorgs(parser, getSparkSession(conf))
new SparkCreateOrgsDedupRecord(parser, getSparkSession(conf))
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
}
@ -64,14 +67,15 @@ public class SparkCopyOpenorgs extends AbstractSparkAction {
log.info("actionSetId: '{}'", actionSetId);
log.info("workingPath: '{}'", workingPath);
String subEntity = "organization";
log.info("Copying openorgs to the working dir");
log.info("Copying organization dedup records to the working dir");
final String outputPath = DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity);
final String outputPath = DedupUtility.createDedupRecordPath(workingPath, actionSetId, "organization");
final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity);
final String entityPath = DedupUtility.createEntityPath(graphBasePath, "organization");
filterOpenorgs(spark, entityPath)
final String mergeRelsPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, "organization");
rootOrganization(spark, entityPath, mergeRelsPath)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
@ -79,26 +83,43 @@ public class SparkCopyOpenorgs extends AbstractSparkAction {
}
public static Dataset<Organization> filterOpenorgs(
public static Dataset<Organization> rootOrganization(
final SparkSession spark,
final String entitiesInputPath) {
final String entitiesInputPath,
final String mergeRelsPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
Dataset<Organization> entities = spark
.createDataset(
sc
.textFile(entitiesInputPath)
.map(it -> OBJECT_MAPPER.readValue(it, Organization.class))
.rdd(),
Encoders.bean(Organization.class));
JavaPairRDD<String, Organization> entities = sc.textFile(entitiesInputPath)
.map(it -> OBJECT_MAPPER.readValue(it, Organization.class))
.mapToPair(o -> new Tuple2<>(o.getId(), o));
log.info("Number of organization entities processed: {}", entities.count());
entities = entities.filter(entities.col("id").contains(DedupUtility.OPENORGS_ID_PREFIX));
//collect root ids (ids in the source of 'merges' relations
JavaPairRDD<String, String> roots = spark
.read()
.load(mergeRelsPath)
.as(Encoders.bean(Relation.class))
.where("relClass == 'merges'")
.map(
(MapFunction<Relation, Tuple2<String, String>>) r -> new Tuple2<>(r.getSource(), "root"),
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.toJavaRDD()
.mapToPair(t -> t)
.distinct();
log.info("Number of Openorgs organization entities: {}", entities.count());
Dataset<Organization> rootOrgs = spark.createDataset(
entities
.leftOuterJoin(roots)
.filter(e -> e._2()._2().isPresent()) //if it has been joined with 'root' then it's a root record
.map(e -> e._2()._1())
.rdd(),
Encoders.bean(Organization.class));
return entities;
log.info("Number of Root organization: {}", entities.count());
return rootOrgs;
}
}

View File

@ -101,6 +101,9 @@ public class SparkUpdateEntity extends AbstractSparkAction {
.mapToPair(
(PairFunction<String, String, String>) s -> new Tuple2<>(
MapDocumentUtil.getJPathString(IDJSONPATH, s), s));
if (type == EntityType.organization) //exclude root records from organizations
entitiesWithId = excludeRootOrgs(entitiesWithId, rel);
JavaRDD<String> map = entitiesWithId
.leftOuterJoin(mergedIds)
.map(k -> {
@ -110,13 +113,6 @@ public class SparkUpdateEntity extends AbstractSparkAction {
return k._2()._1();
});
if (type == EntityType.organization) // exclude openorgs with deletedbyinference=true
map = map.filter(it -> {
Organization org = OBJECT_MAPPER.readValue(it, Organization.class);
return !org.getId().contains("openorgs____") || (org.getId().contains("openorgs____")
&& !org.getDataInfo().getDeletedbyinference());
});
sourceEntity = map.union(sc.textFile(dedupRecordPath));
}
@ -159,4 +155,20 @@ public class SparkUpdateEntity extends AbstractSparkAction {
throw new RuntimeException("Unable to convert json", e);
}
}
private static JavaPairRDD<String, String> excludeRootOrgs(JavaPairRDD<String, String> entitiesWithId, Dataset<Relation> rel) {
JavaPairRDD<String, String> roots = rel
.where("relClass == 'merges'")
.select(rel.col("source"))
.distinct()
.toJavaRDD()
.mapToPair(
(PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "root"));
return entitiesWithId
.leftOuterJoin(roots)
.filter(e -> !e._2()._2().isPresent())
.mapToPair(e -> new Tuple2<>(e._1(), e._2()._1()));
}
}

View File

@ -187,7 +187,7 @@
<error to="Kill"/>
</action>
<!-- copy organization relations in the working dir (in the organization_mergerel dir)-->
<!-- copy organization merge relations in the working dir (in the organization_mergerel dir)-->
<action name="CopyOpenorgsMergeRels">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -220,7 +220,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Create Organizations Dedup Records</name>
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@ -241,33 +241,6 @@
<error to="Kill"/>
</action>
<!--TODO replace with job for the creation of deduprecord for openorgs organizations -->
<!-- copy openorgs to the working dir (in the organization_deduprecord dir)-->
<!--<action name="CopyOpenorgs">-->
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
<!--<master>yarn</master>-->
<!--<mode>cluster</mode>-->
<!--<name>Copy Openorgs Entities</name>-->
<!--<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgs</class>-->
<!--<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>-->
<!--<spark-opts>-->
<!--&#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!--&#45;&#45;executor-cores=${sparkExecutorCores}-->
<!--&#45;&#45;driver-memory=${sparkDriverMemory}-->
<!--&#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!--&#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!--&#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!--&#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!--&#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!--</spark-opts>-->
<!--<arg>&#45;&#45;graphBasePath</arg><arg>${graphBasePath}</arg>-->
<!--<arg>&#45;&#45;workingPath</arg><arg>${workingPath}</arg>-->
<!--<arg>&#45;&#45;actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>-->
<!--</spark>-->
<!--<ok to="UpdateEntity"/>-->
<!--<error to="Kill"/>-->
<!--</action>-->
<action name="UpdateEntity">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>

View File

@ -12,10 +12,14 @@ import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
@ -148,9 +152,14 @@ public class SparkOpenorgsTest implements Serializable {
long orgs_mergerel = spark
.read()
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
.load(DedupUtility.createMergeRelPath(testOutputBasePath, testActionSetId, "organization"))
.count();
Dataset<Relation> orgrels = spark.read().load(DedupUtility.createMergeRelPath(testOutputBasePath, testActionSetId, "organization")).as(Encoders.bean(Relation.class));
for (Relation r: orgrels.toJavaRDD().collect())
System.out.println("r = " + r.getSource() + "---" + r.getTarget() + "---" + r.getRelClass());
assertEquals(384, orgs_mergerel);
}

View File

@ -180,14 +180,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
log.info("Processing Openorgs Merge Rels...");
smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgSimRels);
//TODO cambiare il mapping delle relazioni in modo che crei merges e isMergedIn
// TODO (specifico per questo caso, questa funzione di mapping verrà usata così com'è nel caso di openorgs dedup
break;
case openaire_organizations:
log.info("Processing Organizations...");
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization, verifyNamespacePrefix);
break;
}
log.info("All done.");

View File

@ -50,4 +50,7 @@ GROUP BY
o.trust,
d.id,
d.officialname,
o.country;
o.country;
-- TODO modificare in modo da fare il merge dei pid di tutti i record mergiati (per gli openorgs, approvati)
-- TODO invece per tutti gli altri con dei duplicati non fare questa cosa