forked from D-Net/dnet-hadoop
Modify the jobs that integrate OpenOrgs into the provision workflow: dedup records are no longer created by merging, but are taken directly from the results of the OpenOrgs portal.
This commit is contained in:
parent
1e7e5180fa
commit
c39c82dfe9
|
@ -78,42 +78,16 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
|
|||
|
||||
final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation");
|
||||
|
||||
DedupConfig dedupConf = getConfigurations(isLookUpService, actionSetId).get(0);
|
||||
|
||||
JavaRDD<Relation> rawRels = spark
|
||||
//collect organization merge relations from openorgs database
|
||||
JavaRDD<Relation> mergeRelsRDD = spark
|
||||
.read()
|
||||
.textFile(relationPath)
|
||||
.map(patchRelFn(), Encoders.bean(Relation.class))
|
||||
.toJavaRDD()
|
||||
.filter(this::isOpenorgs)
|
||||
.filter(this::filterOpenorgsRels);
|
||||
.filter(this::isOpenorgs) //take only openorgs relations
|
||||
.filter(this::isMergeRel); //take merges and isMergedIn relations
|
||||
|
||||
JavaRDD<Relation> selfRawRels = rawRels
|
||||
.map(r -> r.getSource())
|
||||
.distinct()
|
||||
.map(s -> rel(s, s, ModelConstants.IS_SIMILAR_TO, dedupConf));
|
||||
|
||||
log.info("Number of raw Openorgs Relations collected: {}", rawRels.count());
|
||||
|
||||
// turn openorgs isSimilarTo relations into mergerels
|
||||
JavaRDD<Relation> mergeRelsRDD = rawRels
|
||||
.union(selfRawRels)
|
||||
.map(r -> {
|
||||
r.setSource(createDedupID(r.getSource())); // create the dedup_id to align it to the openaire dedup
|
||||
// format
|
||||
return r;
|
||||
})
|
||||
.flatMap(rel -> {
|
||||
|
||||
List<Relation> mergerels = new ArrayList<>();
|
||||
|
||||
mergerels.add(rel(rel.getSource(), rel.getTarget(), ModelConstants.MERGES, dedupConf));
|
||||
mergerels.add(rel(rel.getTarget(), rel.getSource(), ModelConstants.IS_MERGED_IN, dedupConf));
|
||||
|
||||
return mergerels.iterator();
|
||||
});
|
||||
|
||||
log.info("Number of Openorgs Merge Relations created: {}", mergeRelsRDD.count());
|
||||
log.info("Number of Openorgs Merge Relations collected: {}", mergeRelsRDD.count());
|
||||
|
||||
spark
|
||||
.createDataset(
|
||||
|
@ -124,45 +98,9 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
|
|||
.parquet(outputPath);
|
||||
}
|
||||
|
||||
private boolean filterOpenorgsRels(Relation rel) {
|
||||
return rel.getRelClass().equals(ModelConstants.IS_SIMILAR_TO)
|
||||
private boolean isMergeRel(Relation rel) {
|
||||
return (rel.getRelClass().equals(ModelConstants.MERGES) || rel.getRelClass().equals(ModelConstants.IS_MERGED_IN))
|
||||
&& rel.getRelType().equals(ModelConstants.ORG_ORG_RELTYPE)
|
||||
&& rel.getSubRelType().equals(ModelConstants.DEDUP);
|
||||
}
|
||||
|
||||
private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) {
|
||||
|
||||
String entityType = dedupConf.getWf().getEntityType();
|
||||
|
||||
Relation r = new Relation();
|
||||
r.setSource(source);
|
||||
r.setTarget(target);
|
||||
r.setRelClass(relClass);
|
||||
r.setRelType(entityType + entityType.substring(0, 1).toUpperCase() + entityType.substring(1));
|
||||
r.setSubRelType(ModelConstants.DEDUP);
|
||||
|
||||
DataInfo info = new DataInfo();
|
||||
info.setDeletedbyinference(false);
|
||||
info.setInferred(true);
|
||||
info.setInvisible(false);
|
||||
info.setInferenceprovenance(dedupConf.getWf().getConfigurationId());
|
||||
Qualifier provenanceAction = new Qualifier();
|
||||
provenanceAction.setClassid(ModelConstants.PROVENANCE_DEDUP);
|
||||
provenanceAction.setClassname(ModelConstants.PROVENANCE_DEDUP);
|
||||
provenanceAction.setSchemeid(ModelConstants.DNET_PROVENANCE_ACTIONS);
|
||||
provenanceAction.setSchemename(ModelConstants.DNET_PROVENANCE_ACTIONS);
|
||||
info.setProvenanceaction(provenanceAction);
|
||||
|
||||
// TODO calculate the trust value based on the similarity score of the elements in the CC
|
||||
// info.setTrust();
|
||||
|
||||
r.setDataInfo(info);
|
||||
return r;
|
||||
}
|
||||
|
||||
public String createDedupID(String id) {
|
||||
|
||||
String prefix = id.split("\\|")[0];
|
||||
return prefix + "|dedup_wf_001::" + DHPUtils.md5(id);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,8 +4,10 @@ package eu.dnetlib.dhp.oa.dedup;
|
|||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -24,11 +26,12 @@ import eu.dnetlib.dhp.schema.oaf.Organization;
|
|||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkCopyOpenorgs extends AbstractSparkAction {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkCopyOpenorgs.class);
|
||||
public class SparkCreateOrgsDedupRecord extends AbstractSparkAction {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkCreateOrgsDedupRecord.class);
|
||||
|
||||
public SparkCopyOpenorgs(ArgumentApplicationParser parser, SparkSession spark) {
|
||||
public SparkCreateOrgsDedupRecord(ArgumentApplicationParser parser, SparkSession spark) {
|
||||
super(parser, spark);
|
||||
}
|
||||
|
||||
|
@ -36,13 +39,13 @@ public class SparkCopyOpenorgs extends AbstractSparkAction {
|
|||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCopyOpenorgs.class
|
||||
SparkCreateOrgsDedupRecord.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/copyOpenorgs_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
new SparkCopyOpenorgs(parser, getSparkSession(conf))
|
||||
new SparkCreateOrgsDedupRecord(parser, getSparkSession(conf))
|
||||
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
||||
}
|
||||
|
||||
|
@ -64,14 +67,15 @@ public class SparkCopyOpenorgs extends AbstractSparkAction {
|
|||
log.info("actionSetId: '{}'", actionSetId);
|
||||
log.info("workingPath: '{}'", workingPath);
|
||||
|
||||
String subEntity = "organization";
|
||||
log.info("Copying openorgs to the working dir");
|
||||
log.info("Copying organization dedup records to the working dir");
|
||||
|
||||
final String outputPath = DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity);
|
||||
final String outputPath = DedupUtility.createDedupRecordPath(workingPath, actionSetId, "organization");
|
||||
|
||||
final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity);
|
||||
final String entityPath = DedupUtility.createEntityPath(graphBasePath, "organization");
|
||||
|
||||
filterOpenorgs(spark, entityPath)
|
||||
final String mergeRelsPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, "organization");
|
||||
|
||||
rootOrganization(spark, entityPath, mergeRelsPath)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
|
@ -79,26 +83,43 @@ public class SparkCopyOpenorgs extends AbstractSparkAction {
|
|||
|
||||
}
|
||||
|
||||
public static Dataset<Organization> filterOpenorgs(
|
||||
public static Dataset<Organization> rootOrganization(
|
||||
final SparkSession spark,
|
||||
final String entitiesInputPath) {
|
||||
final String entitiesInputPath,
|
||||
final String mergeRelsPath) {
|
||||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
Dataset<Organization> entities = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(entitiesInputPath)
|
||||
|
||||
JavaPairRDD<String, Organization> entities = sc.textFile(entitiesInputPath)
|
||||
.map(it -> OBJECT_MAPPER.readValue(it, Organization.class))
|
||||
.rdd(),
|
||||
Encoders.bean(Organization.class));
|
||||
.mapToPair(o -> new Tuple2<>(o.getId(), o));
|
||||
|
||||
log.info("Number of organization entities processed: {}", entities.count());
|
||||
|
||||
entities = entities.filter(entities.col("id").contains(DedupUtility.OPENORGS_ID_PREFIX));
|
||||
//collect root ids (ids in the source of 'merges' relations
|
||||
JavaPairRDD<String, String> roots = spark
|
||||
.read()
|
||||
.load(mergeRelsPath)
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.where("relClass == 'merges'")
|
||||
.map(
|
||||
(MapFunction<Relation, Tuple2<String, String>>) r -> new Tuple2<>(r.getSource(), "root"),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||
.toJavaRDD()
|
||||
.mapToPair(t -> t)
|
||||
.distinct();
|
||||
|
||||
log.info("Number of Openorgs organization entities: {}", entities.count());
|
||||
Dataset<Organization> rootOrgs = spark.createDataset(
|
||||
entities
|
||||
.leftOuterJoin(roots)
|
||||
.filter(e -> e._2()._2().isPresent()) //if it has been joined with 'root' then it's a root record
|
||||
.map(e -> e._2()._1())
|
||||
.rdd(),
|
||||
Encoders.bean(Organization.class));
|
||||
|
||||
return entities;
|
||||
log.info("Number of Root organization: {}", entities.count());
|
||||
|
||||
return rootOrgs;
|
||||
}
|
||||
|
||||
}
|
|
@ -101,6 +101,9 @@ public class SparkUpdateEntity extends AbstractSparkAction {
|
|||
.mapToPair(
|
||||
(PairFunction<String, String, String>) s -> new Tuple2<>(
|
||||
MapDocumentUtil.getJPathString(IDJSONPATH, s), s));
|
||||
if (type == EntityType.organization) //exclude root records from organizations
|
||||
entitiesWithId = excludeRootOrgs(entitiesWithId, rel);
|
||||
|
||||
JavaRDD<String> map = entitiesWithId
|
||||
.leftOuterJoin(mergedIds)
|
||||
.map(k -> {
|
||||
|
@ -110,13 +113,6 @@ public class SparkUpdateEntity extends AbstractSparkAction {
|
|||
return k._2()._1();
|
||||
});
|
||||
|
||||
if (type == EntityType.organization) // exclude openorgs with deletedbyinference=true
|
||||
map = map.filter(it -> {
|
||||
Organization org = OBJECT_MAPPER.readValue(it, Organization.class);
|
||||
return !org.getId().contains("openorgs____") || (org.getId().contains("openorgs____")
|
||||
&& !org.getDataInfo().getDeletedbyinference());
|
||||
});
|
||||
|
||||
sourceEntity = map.union(sc.textFile(dedupRecordPath));
|
||||
|
||||
}
|
||||
|
@ -159,4 +155,20 @@ public class SparkUpdateEntity extends AbstractSparkAction {
|
|||
throw new RuntimeException("Unable to convert json", e);
|
||||
}
|
||||
}
|
||||
|
||||
private static JavaPairRDD<String, String> excludeRootOrgs(JavaPairRDD<String, String> entitiesWithId, Dataset<Relation> rel) {
|
||||
|
||||
JavaPairRDD<String, String> roots = rel
|
||||
.where("relClass == 'merges'")
|
||||
.select(rel.col("source"))
|
||||
.distinct()
|
||||
.toJavaRDD()
|
||||
.mapToPair(
|
||||
(PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "root"));
|
||||
|
||||
return entitiesWithId
|
||||
.leftOuterJoin(roots)
|
||||
.filter(e -> !e._2()._2().isPresent())
|
||||
.mapToPair(e -> new Tuple2<>(e._1(), e._2()._1()));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -187,7 +187,7 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- copy organization relations in the working dir (in the organization_mergerel dir)-->
|
||||
<!-- copy organization merge relations in the working dir (in the organization_mergerel dir)-->
|
||||
<action name="CopyOpenorgsMergeRels">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -220,7 +220,7 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create Organizations Dedup Records</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
|
@ -241,33 +241,6 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!--TODO replace with job for the creation of deduprecord for openorgs organizations -->
|
||||
<!-- copy openorgs to the working dir (in the organization_deduprecord dir)-->
|
||||
<!--<action name="CopyOpenorgs">-->
|
||||
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
|
||||
<!--<master>yarn</master>-->
|
||||
<!--<mode>cluster</mode>-->
|
||||
<!--<name>Copy Openorgs Entities</name>-->
|
||||
<!--<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgs</class>-->
|
||||
<!--<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>-->
|
||||
<!--<spark-opts>-->
|
||||
<!----executor-memory=${sparkExecutorMemory}-->
|
||||
<!----executor-cores=${sparkExecutorCores}-->
|
||||
<!----driver-memory=${sparkDriverMemory}-->
|
||||
<!----conf spark.extraListeners=${spark2ExtraListeners}-->
|
||||
<!----conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
|
||||
<!----conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
|
||||
<!----conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
|
||||
<!----conf spark.sql.shuffle.partitions=3840-->
|
||||
<!--</spark-opts>-->
|
||||
<!--<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>-->
|
||||
<!--<arg>--workingPath</arg><arg>${workingPath}</arg>-->
|
||||
<!--<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>-->
|
||||
<!--</spark>-->
|
||||
<!--<ok to="UpdateEntity"/>-->
|
||||
<!--<error to="Kill"/>-->
|
||||
<!--</action>-->
|
||||
|
||||
<action name="UpdateEntity">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
|
|
@ -12,10 +12,14 @@ import java.io.Serializable;
|
|||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
|
@ -148,9 +152,14 @@ public class SparkOpenorgsTest implements Serializable {
|
|||
|
||||
long orgs_mergerel = spark
|
||||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
||||
.load(DedupUtility.createMergeRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||
.count();
|
||||
|
||||
Dataset<Relation> orgrels = spark.read().load(DedupUtility.createMergeRelPath(testOutputBasePath, testActionSetId, "organization")).as(Encoders.bean(Relation.class));
|
||||
|
||||
for (Relation r: orgrels.toJavaRDD().collect())
|
||||
System.out.println("r = " + r.getSource() + "---" + r.getTarget() + "---" + r.getRelClass());
|
||||
|
||||
assertEquals(384, orgs_mergerel);
|
||||
|
||||
}
|
||||
|
|
|
@ -180,14 +180,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
|
||||
log.info("Processing Openorgs Merge Rels...");
|
||||
smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgSimRels);
|
||||
|
||||
// TODO change the relation mapping so that it creates merges and isMergedIn relations
|
||||
// TODO (specific to this case: this mapping function will be used as-is in the openorgs dedup case)
|
||||
break;
|
||||
|
||||
case openaire_organizations:
|
||||
|
||||
log.info("Processing Organizations...");
|
||||
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization, verifyNamespacePrefix);
|
||||
|
||||
break;
|
||||
}
|
||||
log.info("All done.");
|
||||
|
|
|
@ -51,3 +51,6 @@ GROUP BY
|
|||
d.id,
|
||||
d.officialname,
|
||||
o.country;
|
||||
|
||||
-- TODO modify the query so that the pids of all the merged records are merged (for openorgs, the approved ones)
|
||||
-- TODO for all the other records that have duplicates, instead, do not do this
|
Loading…
Reference in New Issue