forked from D-Net/dnet-hadoop
minor changes and bug fix
This commit is contained in:
parent
28c1cdd132
commit
2355cc4e9b
|
@ -13,6 +13,11 @@ public class OrganizationPidComparator implements Comparator<StructuredProperty>
|
|||
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(PidType.openorgs))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.openorgs))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.GRID))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.GRID))
|
||||
|
|
|
@ -9,7 +9,7 @@ public enum PidType {
|
|||
doi, pmid, pmc, handle, arXiv, nct, pdb,
|
||||
|
||||
// Organization
|
||||
GRID, mag_id, urn,
|
||||
openorgs, corda, corda_h2020, GRID, mag_id, urn,
|
||||
|
||||
// Used by dedup
|
||||
undefined, original;
|
||||
|
|
|
@ -99,6 +99,10 @@ abstract class AbstractSparkAction implements Serializable {
|
|||
dataset.write().option("compression", "gzip").mode(mode).json(outPath);
|
||||
}
|
||||
|
||||
protected static <T> void saveParquet(Dataset<T> dataset, String outPath, SaveMode mode) {
|
||||
dataset.write().option("compression", "gzip").mode(mode).parquet(outPath);
|
||||
}
|
||||
|
||||
protected static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
|
|
@ -89,7 +89,7 @@ public class DedupRecordFactory {
|
|||
t -> {
|
||||
T duplicate = t._2();
|
||||
|
||||
// prepare the list of pids to use for the id generation
|
||||
// prepare the list of pids to be used for the id generation
|
||||
bestPids.add(Identifier.newInstance(duplicate));
|
||||
|
||||
entity.mergeFrom(duplicate);
|
||||
|
|
|
@ -36,7 +36,14 @@ public class IdGenerator implements Serializable {
|
|||
}
|
||||
|
||||
private static String dedupify(String ns) {
|
||||
StringBuilder prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
|
||||
|
||||
StringBuilder prefix;
|
||||
if (PidType.valueOf(substringBefore(ns, "_")) == PidType.openorgs) {
|
||||
prefix = new StringBuilder(substringBefore(ns, "_"));
|
||||
} else {
|
||||
prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
|
||||
}
|
||||
|
||||
while (prefix.length() < 12) {
|
||||
prefix.append("_");
|
||||
}
|
||||
|
|
|
@ -23,12 +23,13 @@ import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
|||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import net.sf.saxon.ma.trie.Tuple2;
|
||||
|
||||
//copy simrels (verified) from relation to the workdir in order to make them available for the deduplication
|
||||
public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkCopyOpenorgsMergeRels.class);
|
||||
public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup";
|
||||
|
@ -84,24 +85,32 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
|
|||
.map(patchRelFn(), Encoders.bean(Relation.class))
|
||||
.toJavaRDD()
|
||||
.filter(this::isOpenorgs)
|
||||
.filter(this::filterOpenorgsRels)
|
||||
.filter(this::excludeOpenorgsMesh)
|
||||
.filter(this::excludeNonOpenorgs); // excludes relations with no openorgs id involved
|
||||
.filter(this::filterOpenorgsRels);
|
||||
|
||||
JavaRDD<Relation> selfRawRels = rawRels
|
||||
.map(r -> r.getSource())
|
||||
.distinct()
|
||||
.map(s -> rel(s, s, "isSimilarTo", dedupConf));
|
||||
|
||||
log.info("Number of raw Openorgs Relations collected: {}", rawRels.count());
|
||||
|
||||
// turn openorgs isSimilarTo relations into mergerels
|
||||
JavaRDD<Relation> mergeRelsRDD = rawRels.flatMap(rel -> {
|
||||
List<Relation> mergerels = new ArrayList<>();
|
||||
JavaRDD<Relation> mergeRelsRDD = rawRels
|
||||
.union(selfRawRels)
|
||||
.map(r -> {
|
||||
r.setSource(createDedupID(r.getSource())); // create the dedup_id to align it to the openaire dedup
|
||||
// format
|
||||
return r;
|
||||
})
|
||||
.flatMap(rel -> {
|
||||
|
||||
String openorgsId = rel.getSource().contains("openorgs____") ? rel.getSource() : rel.getTarget();
|
||||
String mergedId = rel.getSource().contains("openorgs____") ? rel.getTarget() : rel.getSource();
|
||||
List<Relation> mergerels = new ArrayList<>();
|
||||
|
||||
mergerels.add(rel(openorgsId, mergedId, "merges", dedupConf));
|
||||
mergerels.add(rel(mergedId, openorgsId, "isMergedIn", dedupConf));
|
||||
mergerels.add(rel(rel.getSource(), rel.getTarget(), "merges", dedupConf));
|
||||
mergerels.add(rel(rel.getTarget(), rel.getSource(), "isMergedIn", dedupConf));
|
||||
|
||||
return mergerels.iterator();
|
||||
});
|
||||
return mergerels.iterator();
|
||||
});
|
||||
|
||||
log.info("Number of Openorgs Merge Relations created: {}", mergeRelsRDD.count());
|
||||
|
||||
|
@ -144,22 +153,6 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
|
|||
return false;
|
||||
}
|
||||
|
||||
private boolean excludeOpenorgsMesh(Relation rel) {
|
||||
|
||||
if (rel.getSource().contains("openorgsmesh") || rel.getTarget().contains("openorgsmesh")) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean excludeNonOpenorgs(Relation rel) {
|
||||
|
||||
if (rel.getSource().contains("openorgs____") || rel.getTarget().contains("openorgs____")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) {
|
||||
|
||||
String entityType = dedupConf.getWf().getEntityType();
|
||||
|
@ -189,4 +182,10 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
|
|||
r.setDataInfo(info);
|
||||
return r;
|
||||
}
|
||||
|
||||
public String createDedupID(String id) {
|
||||
|
||||
String prefix = id.split("\\|")[0];
|
||||
return prefix + "|dedup_wf_001::" + DHPUtils.md5(id);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -82,7 +82,7 @@ public class SparkCopyOpenorgsSimRels extends AbstractSparkAction {
|
|||
.map(patchRelFn(), Encoders.bean(Relation.class))
|
||||
.filter(this::filterOpenorgsRels);
|
||||
|
||||
save(rawRels, outputPath, SaveMode.Append);
|
||||
saveParquet(rawRels, outputPath, SaveMode.Append);
|
||||
|
||||
log.info("Copied " + rawRels.count() + " Similarity Relations");
|
||||
}
|
||||
|
|
|
@ -109,7 +109,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
|||
.rdd(),
|
||||
Encoders.bean(Relation.class));
|
||||
|
||||
save(simRels, outputPath, SaveMode.Append);
|
||||
saveParquet(simRels, outputPath, SaveMode.Append);
|
||||
|
||||
log.info("Generated " + simRels.count() + " Similarity Relations");
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
|
@ -22,11 +23,10 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
|
@ -103,12 +103,22 @@ public class SparkUpdateEntity extends AbstractSparkAction {
|
|||
MapDocumentUtil.getJPathString(IDJSONPATH, s), s));
|
||||
JavaRDD<String> map = entitiesWithId
|
||||
.leftOuterJoin(mergedIds)
|
||||
.map(
|
||||
k -> k._2()._2().isPresent()
|
||||
? updateDeletedByInference(k._2()._1(), clazz)
|
||||
: k._2()._1());
|
||||
.map(k -> {
|
||||
if (k._2()._2().isPresent()) {
|
||||
return updateDeletedByInference(k._2()._1(), clazz);
|
||||
}
|
||||
return k._2()._1();
|
||||
});
|
||||
|
||||
if (type == EntityType.organization) // exclude openorgs with deletedbyinference=true
|
||||
map = map.filter(it -> {
|
||||
Organization org = OBJECT_MAPPER.readValue(it, Organization.class);
|
||||
return !org.getId().contains("openorgs____") || (org.getId().contains("openorgs____")
|
||||
&& !org.getDataInfo().getDeletedbyinference());
|
||||
});
|
||||
|
||||
sourceEntity = map.union(sc.textFile(dedupRecordPath));
|
||||
|
||||
}
|
||||
|
||||
sourceEntity.saveAsTextFile(outputPath, GzipCodec.class);
|
||||
|
|
|
@ -83,7 +83,7 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resetWorkingPath"/>
|
||||
<start to="testOpenorgs"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
|
@ -98,16 +98,14 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!--<action name="testOpenorgs">-->
|
||||
<!--<fs>-->
|
||||
<!--<delete path="${workingPath}/${actionSetIdOpenorgs}/organization_simrel"/>-->
|
||||
<!--<delete path="${workingPath}/${actionSetIdOpenorgs}/organization_mergerel"/>-->
|
||||
<!--<delete path="${workingPath}/${actionSetIdOpenorgs}/organization_deduprecord"/>-->
|
||||
<!--<delete path="${dedupGraphPath}"/>-->
|
||||
<!--</fs>-->
|
||||
<!--<ok to="CopyOpenorgsMergeRels"/>-->
|
||||
<!--<error to="Kill"/>-->
|
||||
<!--</action>-->
|
||||
<action name="testOpenorgs">
|
||||
<fs>
|
||||
<delete path="${workingPath}/${actionSetIdOpenorgs}"/>
|
||||
<delete path="${dedupGraphPath}"/>
|
||||
</fs>
|
||||
<ok to="CopyOpenorgsMergeRels"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="CreateSimRel">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
@ -213,17 +211,16 @@
|
|||
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
|
||||
<arg>--numPartitions</arg><arg>8000</arg>
|
||||
</spark>
|
||||
<ok to="CopyOpenorgs"/>
|
||||
<ok to="CreateOrgsDedupRecord"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- copy openorgs to the working dir (in the organization_deduprecord dir)-->
|
||||
<action name="CopyOpenorgs">
|
||||
<action name="CreateOrgsDedupRecord">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Copy Openorgs Entities</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgs</class>
|
||||
<name>Create Organizations Dedup Records</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
|
@ -237,12 +234,40 @@
|
|||
</spark-opts>
|
||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
|
||||
</spark>
|
||||
<ok to="UpdateEntity"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!--TODO replace with job for the creation of deduprecord for openorgs organizations -->
|
||||
<!-- copy openorgs to the working dir (in the organization_deduprecord dir)-->
|
||||
<!--<action name="CopyOpenorgs">-->
|
||||
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
|
||||
<!--<master>yarn</master>-->
|
||||
<!--<mode>cluster</mode>-->
|
||||
<!--<name>Copy Openorgs Entities</name>-->
|
||||
<!--<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgs</class>-->
|
||||
<!--<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>-->
|
||||
<!--<spark-opts>-->
|
||||
<!----executor-memory=${sparkExecutorMemory}-->
|
||||
<!----executor-cores=${sparkExecutorCores}-->
|
||||
<!----driver-memory=${sparkDriverMemory}-->
|
||||
<!----conf spark.extraListeners=${spark2ExtraListeners}-->
|
||||
<!----conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
|
||||
<!----conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
|
||||
<!----conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
|
||||
<!----conf spark.sql.shuffle.partitions=3840-->
|
||||
<!--</spark-opts>-->
|
||||
<!--<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>-->
|
||||
<!--<arg>--workingPath</arg><arg>${workingPath}</arg>-->
|
||||
<!--<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>-->
|
||||
<!--</spark>-->
|
||||
<!--<ok to="UpdateEntity"/>-->
|
||||
<!--<error to="Kill"/>-->
|
||||
<!--</action>-->
|
||||
|
||||
<action name="UpdateEntity">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
|
|
@ -112,7 +112,7 @@ public class EntityMergerTest implements Serializable {
|
|||
assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
|
||||
|
||||
// verify authors
|
||||
assertEquals(9, pub_merged.getAuthor().size());
|
||||
assertEquals(13, pub_merged.getAuthor().size());
|
||||
assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
|
||||
|
||||
// verify title
|
||||
|
|
|
@ -36,6 +36,8 @@ public class IdGeneratorTest {
|
|||
private static List<Identifier<Publication>> bestIds2;
|
||||
private static List<Identifier<Publication>> bestIds3;
|
||||
|
||||
private static List<Identifier<Organization>> bestIdsOrg;
|
||||
|
||||
private static String testEntityBasePath;
|
||||
|
||||
@BeforeAll
|
||||
|
@ -48,6 +50,8 @@ public class IdGeneratorTest {
|
|||
bestIds = createBestIds(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
|
||||
bestIds2 = createBestIds(testEntityBasePath + "/publication_idgeneration2.json", Publication.class);
|
||||
bestIds3 = createBestIds(testEntityBasePath + "/publication_idgeneration3.json", Publication.class);
|
||||
|
||||
bestIdsOrg = createBestIds(testEntityBasePath + "/organization_idgeneration.json", Organization.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -76,6 +80,13 @@ public class IdGeneratorTest {
|
|||
assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void generateIdOrganizationTest() {
|
||||
String id1 = IdGenerator.generate(bestIdsOrg, "20|defaultID");
|
||||
|
||||
assertEquals("20|openorgs____::599c15a70fcb03be6ba08f75f14d6076", id1);
|
||||
}
|
||||
|
||||
protected static <T extends OafEntity> List<Identifier<T>> createBestIds(String path, Class<T> clazz) {
|
||||
final Stream<Identifier<T>> ids = readSample(path, clazz)
|
||||
.stream()
|
||||
|
|
|
@ -174,27 +174,27 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
long orgs_simrel = spark
|
||||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||
.count();
|
||||
|
||||
long pubs_simrel = spark
|
||||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "publication"))
|
||||
.count();
|
||||
|
||||
long sw_simrel = spark
|
||||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "software"))
|
||||
.count();
|
||||
|
||||
long ds_simrel = spark
|
||||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "dataset"))
|
||||
.count();
|
||||
|
||||
long orp_simrel = spark
|
||||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct"))
|
||||
.count();
|
||||
|
||||
assertEquals(3082, orgs_simrel);
|
||||
|
@ -204,6 +204,7 @@ public class SparkDedupTest implements Serializable {
|
|||
assertEquals(6750, orp_simrel);
|
||||
}
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
@Order(2)
|
||||
public void collectSimRelsTest() throws Exception {
|
||||
|
@ -254,9 +255,15 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
long orp_simrel = spark
|
||||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
||||
.json(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
||||
.count();
|
||||
|
||||
// System.out.println("orgs_simrel = " + orgs_simrel);
|
||||
// System.out.println("pubs_simrel = " + pubs_simrel);
|
||||
// System.out.println("sw_simrel = " + sw_simrel);
|
||||
// System.out.println("ds_simrel = " + ds_simrel);
|
||||
// System.out.println("orp_simrel = " + orp_simrel);
|
||||
|
||||
assertEquals(3672, orgs_simrel);
|
||||
assertEquals(10459, pubs_simrel);
|
||||
assertEquals(3767, sw_simrel);
|
||||
|
@ -456,7 +463,7 @@ public class SparkDedupTest implements Serializable {
|
|||
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
||||
.count();
|
||||
|
||||
assertEquals(84, orgs_deduprecord);
|
||||
assertEquals(85, orgs_deduprecord);
|
||||
assertEquals(65, pubs_deduprecord);
|
||||
assertEquals(51, sw_deduprecord);
|
||||
assertEquals(97, ds_deduprecord);
|
||||
|
@ -540,7 +547,7 @@ public class SparkDedupTest implements Serializable {
|
|||
.count();
|
||||
|
||||
assertEquals(896, publications);
|
||||
assertEquals(837, organizations);
|
||||
assertEquals(838, organizations);
|
||||
assertEquals(100, projects);
|
||||
assertEquals(100, datasource);
|
||||
assertEquals(200, softwares);
|
||||
|
|
|
@ -110,6 +110,7 @@ public class SparkOpenorgsTest implements Serializable {
|
|||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
}
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
public void copyOpenorgsTest() throws Exception {
|
||||
|
||||
|
@ -162,7 +163,7 @@ public class SparkOpenorgsTest implements Serializable {
|
|||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
||||
.count();
|
||||
|
||||
assertEquals(6, orgs_mergerel);
|
||||
assertEquals(384, orgs_mergerel);
|
||||
|
||||
}
|
||||
|
||||
|
@ -191,7 +192,7 @@ public class SparkOpenorgsTest implements Serializable {
|
|||
.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
|
||||
.count();
|
||||
|
||||
assertEquals(96, orgs_simrel);
|
||||
assertEquals(73, orgs_simrel);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -163,14 +163,25 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
.execute(
|
||||
"queryProjectOrganization.sql", smdbe::processProjectOrganization, verifyNamespacePrefix);
|
||||
break;
|
||||
case openorgs:
|
||||
case openorgs_dedup:
|
||||
log.info("Processing Openorgs...");
|
||||
smdbe
|
||||
.execute(
|
||||
"queryOrganizationsFromOpenOrgsDB.sql", smdbe::processOrganization, verifyNamespacePrefix);
|
||||
"queryOpenOrgsForOrgsDedup.sql", smdbe::processOrganization, verifyNamespacePrefix);
|
||||
|
||||
log.info("Processing Openorgs Merge Rels...");
|
||||
smdbe.execute("querySimilarityFromOpenOrgsDB.sql", smdbe::processOrgOrgSimRels);
|
||||
smdbe.execute("queryOpenOrgsSimilarityForOrgsDedup.sql", smdbe::processOrgOrgSimRels);
|
||||
|
||||
break;
|
||||
|
||||
case openorgs:
|
||||
log.info("Processing Openorgs For Provision...");
|
||||
smdbe
|
||||
.execute(
|
||||
"queryOpenOrgsForProvision.sql", smdbe::processOrganization, verifyNamespacePrefix);
|
||||
|
||||
log.info("Processing Openorgs Merge Rels...");
|
||||
smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgSimRels);
|
||||
|
||||
break;
|
||||
|
||||
|
@ -647,17 +658,19 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
final Relation r2 = new Relation();
|
||||
r2.setRelType(ORG_ORG_RELTYPE);
|
||||
r2.setSubRelType(ORG_ORG_SUBRELTYPE);
|
||||
r2.setRelClass(relClass);
|
||||
r2.setSource(orgId2);
|
||||
r2.setTarget(orgId1);
|
||||
r2.setCollectedfrom(collectedFrom);
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
// removed because there's no difference between two sides //TODO
|
||||
// final Relation r2 = new Relation();
|
||||
// r2.setRelType(ORG_ORG_RELTYPE);
|
||||
// r2.setSubRelType(ORG_ORG_SUBRELTYPE);
|
||||
// r2.setRelClass(relClass);
|
||||
// r2.setSource(orgId2);
|
||||
// r2.setTarget(orgId1);
|
||||
// r2.setCollectedfrom(collectedFrom);
|
||||
// r2.setDataInfo(info);
|
||||
// r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
// return Arrays.asList(r1, r2);
|
||||
|
||||
return Arrays.asList(r1, r2);
|
||||
return Arrays.asList(r1);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
|
|
@ -4,7 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
|
|||
//enum to specify the different actions available for the MigrateDbEntitiesApplication job
|
||||
public enum MigrateAction {
|
||||
claims, // migrate claims to the raw graph
|
||||
openorgs, // migrate organizations from openorgs to the raw graph
|
||||
openorgs_dedup, // migrate organizations from openorgs to the raw graph
|
||||
openorgs, // migrate organization from openorgs to the raw graph for provision
|
||||
openaire, // migrate openaire entities to the raw graph
|
||||
openaire_organizations // migrate openaire organizations entities to the raw graph
|
||||
}
|
||||
|
|
|
@ -156,7 +156,7 @@
|
|||
<arg>--postgresUser</arg><arg>${postgresOpenOrgsUser}</arg>
|
||||
<arg>--postgresPassword</arg><arg>${postgresOpenOrgsPassword}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--action</arg><arg>openorgs</arg>
|
||||
<arg>--action</arg><arg>openorgs_dedup</arg>
|
||||
<arg>--dbschema</arg><arg>${dbSchema}</arg>
|
||||
<arg>--nsPrefixBlacklist</arg><arg>${nsPrefixBlacklist}</arg>
|
||||
</java>
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
SELECT
|
||||
o.id AS organizationid,
|
||||
coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname,
|
||||
o.name AS legalname,
|
||||
array_agg(DISTINCT n.name) AS "alternativeNames",
|
||||
(array_agg(u.url))[1] AS websiteurl,
|
||||
'' AS logourl,
|
||||
o.creation_date AS dateofcollection,
|
||||
o.modification_date AS dateoftransformation,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.95 AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
'openaire____::openorgs' AS collectedfromid,
|
||||
'OpenOrgs Database' AS collectedfromname,
|
||||
o.country || '@@@dnet:countries' AS country,
|
||||
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
|
||||
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid,
|
||||
null AS eclegalbody,
|
||||
null AS eclegalperson,
|
||||
null AS ecnonprofit,
|
||||
null AS ecresearchorganization,
|
||||
null AS echighereducation,
|
||||
null AS ecinternationalorganizationeurinterests,
|
||||
null AS ecinternationalorganization,
|
||||
null AS ecenterprise,
|
||||
null AS ecsmevalidated,
|
||||
null AS ecnutscode
|
||||
FROM organizations o
|
||||
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
|
||||
LEFT OUTER JOIN urls u ON (u.id = o.id)
|
||||
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
|
||||
LEFT OUTER JOIN other_names n ON (n.id = o.id)
|
||||
WHERE
|
||||
o.status = 'approved'
|
||||
GROUP BY
|
||||
o.id,
|
||||
o.name,
|
||||
o.creation_date,
|
||||
o.modification_date,
|
||||
o.country;
|
|
@ -23,7 +23,7 @@ SELECT
|
|||
false AS deletedbyinference,
|
||||
0.99 AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
'isSimilarTo' AS relclass
|
||||
'isSimilarTo' AS relclass
|
||||
FROM other_names n
|
||||
LEFT OUTER JOIN organizations o ON (n.id = o.id)
|
||||
|
||||
|
@ -41,7 +41,3 @@ SELECT
|
|||
'' AS inferenceprovenance,
|
||||
'isDifferentFrom' AS relclass
|
||||
FROM oa_duplicates WHERE reltype = 'is_different'
|
||||
|
||||
|
||||
--TODO ???
|
||||
--Creare relazioni isDifferentFrom anche tra i suggerimenti: (A is_similar B) and (A is_different C) => (B is_different C)
|
|
@ -0,0 +1,12 @@
|
|||
-- relations approved by the user and suggested by the dedup
|
||||
SELECT
|
||||
local_id AS id1,
|
||||
oa_original_id AS id2,
|
||||
'openaire____::openorgs' AS collectedfromid,
|
||||
'OpenOrgs Database' AS collectedfromname,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.99 AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
'isSimilarTo' AS relclass
|
||||
FROM oa_duplicates WHERE reltype = 'is_similar' OR reltype = 'suggested';
|
Loading…
Reference in New Issue