addition of pids in the query for the export of openorgs for the provision, addition of ec_fields in the openorgs model

This commit is contained in:
miconis 2021-04-07 14:27:43 +02:00
parent eaaefb8b4c
commit bf685d849f
13 changed files with 270 additions and 127 deletions

View File

@ -142,4 +142,12 @@ abstract class AbstractSparkAction implements Serializable {
.isPresent())
.orElse(false);
}
protected static Boolean parseECField(Field<String> field) {
if (field == null)
return null;
if (StringUtils.isBlank(field.getValue()) || field.getValue().equalsIgnoreCase("null"))
return null;
return field.getValue().equalsIgnoreCase("true");
}
}

View File

@ -78,14 +78,14 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation");
//collect organization merge relations from openorgs database
// collect organization merge relations from openorgs database
JavaRDD<Relation> mergeRelsRDD = spark
.read()
.textFile(relationPath)
.map(patchRelFn(), Encoders.bean(Relation.class))
.toJavaRDD()
.filter(this::isOpenorgs) //take only openorgs relations
.filter(this::isMergeRel); //take merges and isMergedIn relations
.filter(this::isOpenorgs) // take only openorgs relations
.filter(this::isMergeRel); // take merges and isMergedIn relations
log.info("Number of Openorgs Merge Relations collected: {}", mergeRelsRDD.count());
@ -99,7 +99,8 @@ public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
}
private boolean isMergeRel(Relation rel) {
return (rel.getRelClass().equals(ModelConstants.MERGES) || rel.getRelClass().equals(ModelConstants.IS_MERGED_IN))
return (rel.getRelClass().equals(ModelConstants.MERGES)
|| rel.getRelClass().equals(ModelConstants.IS_MERGED_IN))
&& rel.getRelType().equals(ModelConstants.ORG_ORG_RELTYPE)
&& rel.getSubRelType().equals(ModelConstants.DEDUP);
}

View File

@ -4,7 +4,6 @@ package eu.dnetlib.dhp.oa.dedup;
import java.io.IOException;
import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -90,13 +90,14 @@ public class SparkCreateOrgsDedupRecord extends AbstractSparkAction {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<String, Organization> entities = sc.textFile(entitiesInputPath)
JavaPairRDD<String, Organization> entities = sc
.textFile(entitiesInputPath)
.map(it -> OBJECT_MAPPER.readValue(it, Organization.class))
.mapToPair(o -> new Tuple2<>(o.getId(), o));
log.info("Number of organization entities processed: {}", entities.count());
//collect root ids (ids in the source of 'merges' relations
// collect root ids (ids in the source of 'merges' relations
JavaPairRDD<String, String> roots = spark
.read()
.load(mergeRelsPath)
@ -109,10 +110,11 @@ public class SparkCreateOrgsDedupRecord extends AbstractSparkAction {
.mapToPair(t -> t)
.distinct();
Dataset<Organization> rootOrgs = spark.createDataset(
Dataset<Organization> rootOrgs = spark
.createDataset(
entities
.leftOuterJoin(roots)
.filter(e -> e._2()._2().isPresent()) //if it has been joined with 'root' then it's a root record
.filter(e -> e._2()._2().isPresent()) // if it has been joined with 'root' then it's a root record
.map(e -> e._2()._1())
.rdd(),
Encoders.bean(Organization.class));

View File

@ -183,7 +183,17 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "",
r._1()._2().getCollectedfrom().get(0).getValue(),
"",
structuredPropertyListToString(r._1()._2().getPid())),
structuredPropertyListToString(r._1()._2().getPid()),
parseECField(r._1()._2().getEclegalbody()),
parseECField(r._1()._2().getEclegalperson()),
parseECField(r._1()._2().getEcnonprofit()),
parseECField(r._1()._2().getEcresearchorganization()),
parseECField(r._1()._2().getEchighereducation()),
parseECField(r._1()._2().getEcinternationalorganizationeurinterests()),
parseECField(r._1()._2().getEcinternationalorganization()),
parseECField(r._1()._2().getEcenterprise()),
parseECField(r._1()._2().getEcsmevalidated()),
parseECField(r._1()._2().getEcnutscode())),
Encoders.bean(OrgSimRel.class));
}

View File

@ -224,7 +224,17 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
.map(c -> Optional.ofNullable(c.get(0)).map(KeyValue::getValue).orElse(""))
.orElse(""),
r._1()._3(),
structuredPropertyListToString(o.getPid()));
structuredPropertyListToString(o.getPid()),
parseECField(o.getEclegalbody()),
parseECField(o.getEclegalperson()),
parseECField(o.getEcnonprofit()),
parseECField(o.getEcresearchorganization()),
parseECField(o.getEchighereducation()),
parseECField(o.getEcinternationalorganizationeurinterests()),
parseECField(o.getEcinternationalorganization()),
parseECField(o.getEcenterprise()),
parseECField(o.getEcsmevalidated()),
parseECField(o.getEcnutscode()));
},
Encoders.bean(OrgSimRel.class))
.map(
@ -301,7 +311,17 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
r._2()._2().getWebsiteurl() != null ? r._2()._2().getWebsiteurl().getValue() : "",
r._2()._2().getCollectedfrom().get(0).getValue(),
"group::" + r._1()._1(),
structuredPropertyListToString(r._2()._2().getPid())),
structuredPropertyListToString(r._2()._2().getPid()),
parseECField(r._2()._2().getEclegalbody()),
parseECField(r._2()._2().getEclegalperson()),
parseECField(r._2()._2().getEcnonprofit()),
parseECField(r._2()._2().getEcresearchorganization()),
parseECField(r._2()._2().getEchighereducation()),
parseECField(r._2()._2().getEcinternationalorganizationeurinterests()),
parseECField(r._2()._2().getEcinternationalorganization()),
parseECField(r._2()._2().getEcenterprise()),
parseECField(r._2()._2().getEcsmevalidated()),
parseECField(r._2()._2().getEcnutscode())),
Encoders.bean(OrgSimRel.class))
.map(
(MapFunction<OrgSimRel, Tuple2<String, OrgSimRel>>) o -> new Tuple2<>(o.getLocal_id(), o),

View File

@ -101,7 +101,7 @@ public class SparkUpdateEntity extends AbstractSparkAction {
.mapToPair(
(PairFunction<String, String, String>) s -> new Tuple2<>(
MapDocumentUtil.getJPathString(IDJSONPATH, s), s));
if (type == EntityType.organization) //exclude root records from organizations
if (type == EntityType.organization) // exclude root records from organizations
entitiesWithId = excludeRootOrgs(entitiesWithId, rel);
JavaRDD<String> map = entitiesWithId
@ -156,7 +156,8 @@ public class SparkUpdateEntity extends AbstractSparkAction {
}
}
private static JavaPairRDD<String, String> excludeRootOrgs(JavaPairRDD<String, String> entitiesWithId, Dataset<Relation> rel) {
private static JavaPairRDD<String, String> excludeRootOrgs(JavaPairRDD<String, String> entitiesWithId,
Dataset<Relation> rel) {
JavaPairRDD<String, String> roots = rel
.where("relClass == 'merges'")

View File

@ -14,12 +14,25 @@ public class OrgSimRel implements Serializable {
String oa_collectedfrom;
String group_id;
String pid_list; // separator for type-pid: "###"; separator for pids: "@@@"
Boolean ec_legalbody;
Boolean ec_legalperson;
Boolean ec_nonprofit;
Boolean ec_researchorganization;
Boolean ec_highereducation;
Boolean ec_internationalorganizationeurinterests;
Boolean ec_internationalorganization;
Boolean ec_enterprise;
Boolean ec_smevalidated;
Boolean ec_nutscode;
public OrgSimRel() {
}
public OrgSimRel(String local_id, String oa_original_id, String oa_name, String oa_acronym, String oa_country,
String oa_url, String oa_collectedfrom, String group_id, String pid_list) {
String oa_url, String oa_collectedfrom, String group_id, String pid_list, Boolean ec_legalbody,
Boolean ec_legalperson, Boolean ec_nonprofit, Boolean ec_researchorganization, Boolean ec_highereducation,
Boolean ec_internationalorganizationeurinterests, Boolean ec_internationalorganization, Boolean ec_enterprise,
Boolean ec_smevalidated, Boolean ec_nutscode) {
this.local_id = local_id;
this.oa_original_id = oa_original_id;
this.oa_name = oa_name;
@ -29,6 +42,16 @@ public class OrgSimRel implements Serializable {
this.oa_collectedfrom = oa_collectedfrom;
this.group_id = group_id;
this.pid_list = pid_list;
this.ec_legalbody = ec_legalbody;
this.ec_legalperson = ec_legalperson;
this.ec_nonprofit = ec_nonprofit;
this.ec_researchorganization = ec_researchorganization;
this.ec_highereducation = ec_highereducation;
this.ec_internationalorganizationeurinterests = ec_internationalorganizationeurinterests;
this.ec_internationalorganization = ec_internationalorganization;
this.ec_enterprise = ec_enterprise;
this.ec_smevalidated = ec_smevalidated;
this.ec_nutscode = ec_nutscode;
}
public String getLocal_id() {
@ -103,6 +126,86 @@ public class OrgSimRel implements Serializable {
this.pid_list = pid_list;
}
public Boolean getEc_legalbody() {
return ec_legalbody;
}
public void setEc_legalbody(Boolean ec_legalbody) {
this.ec_legalbody = ec_legalbody;
}
public Boolean getEc_legalperson() {
return ec_legalperson;
}
public void setEc_legalperson(Boolean ec_legalperson) {
this.ec_legalperson = ec_legalperson;
}
public Boolean getEc_nonprofit() {
return ec_nonprofit;
}
public void setEc_nonprofit(Boolean ec_nonprofit) {
this.ec_nonprofit = ec_nonprofit;
}
public Boolean getEc_researchorganization() {
return ec_researchorganization;
}
public void setEc_researchorganization(Boolean ec_researchorganization) {
this.ec_researchorganization = ec_researchorganization;
}
public Boolean getEc_highereducation() {
return ec_highereducation;
}
public void setEc_highereducation(Boolean ec_highereducation) {
this.ec_highereducation = ec_highereducation;
}
public Boolean getEc_internationalorganizationeurinterests() {
return ec_internationalorganizationeurinterests;
}
public void setEc_internationalorganizationeurinterests(Boolean ec_internationalorganizationeurinterests) {
this.ec_internationalorganizationeurinterests = ec_internationalorganizationeurinterests;
}
public Boolean getEc_internationalorganization() {
return ec_internationalorganization;
}
public void setEc_internationalorganization(Boolean ec_internationalorganization) {
this.ec_internationalorganization = ec_internationalorganization;
}
public Boolean getEc_enterprise() {
return ec_enterprise;
}
public void setEc_enterprise(Boolean ec_enterprise) {
this.ec_enterprise = ec_enterprise;
}
public Boolean getEc_smevalidated() {
return ec_smevalidated;
}
public void setEc_smevalidated(Boolean ec_smevalidated) {
this.ec_smevalidated = ec_smevalidated;
}
public Boolean getEc_nutscode() {
return ec_nutscode;
}
public void setEc_nutscode(Boolean ec_nutscode) {
this.ec_nutscode = ec_nutscode;
}
@Override
public String toString() {
return "OrgSimRel{" +
@ -115,6 +218,17 @@ public class OrgSimRel implements Serializable {
", oa_collectedfrom='" + oa_collectedfrom + '\'' +
", group_id='" + group_id + '\'' +
", pid_list='" + pid_list + '\'' +
", ec_legalbody=" + ec_legalbody +
", ec_legalperson=" + ec_legalperson +
", ec_nonprofit=" + ec_nonprofit +
", ec_researchorganization=" + ec_researchorganization +
", ec_highereducation=" + ec_highereducation +
", ec_internationalorganizationeurinterests=" + ec_internationalorganizationeurinterests +
", ec_internationalorganization=" + ec_internationalorganization +
", ec_enterprise=" + ec_enterprise +
", ec_smevalidated=" + ec_smevalidated +
", ec_nutscode=" + ec_nutscode +
'}';
}
}

View File

@ -12,7 +12,6 @@ import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
@ -31,6 +30,7 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -102,34 +102,6 @@ public class SparkOpenorgsTest implements Serializable {
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
}
@Disabled
@Test
public void copyOpenorgsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCopyOpenorgs.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/copyOpenorgs_parameters.json")));
parser
.parseArgument(
new String[] {
"-i", testGraphBasePath,
"-asi", testActionSetId,
"-w", testOutputBasePath,
"-np", "50"
});
new SparkCopyOpenorgs(parser, spark).run(isLookUpService);
long orgs_deduprecord = jsc
.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord")
.count();
assertEquals(100, orgs_deduprecord);
}
@Test
public void copyOpenorgsMergeRels() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -155,9 +127,12 @@ public class SparkOpenorgsTest implements Serializable {
.load(DedupUtility.createMergeRelPath(testOutputBasePath, testActionSetId, "organization"))
.count();
Dataset<Relation> orgrels = spark.read().load(DedupUtility.createMergeRelPath(testOutputBasePath, testActionSetId, "organization")).as(Encoders.bean(Relation.class));
Dataset<Relation> orgrels = spark
.read()
.load(DedupUtility.createMergeRelPath(testOutputBasePath, testActionSetId, "organization"))
.as(Encoders.bean(Relation.class));
for (Relation r: orgrels.toJavaRDD().collect())
for (Relation r : orgrels.toJavaRDD().collect())
System.out.println("r = " + r.getSource() + "---" + r.getTarget() + "---" + r.getRelClass());
assertEquals(384, orgs_mergerel);

View File

@ -1,28 +1,7 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
@ -161,27 +140,24 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
.execute(
"queryProjectOrganization.sql", smdbe::processProjectOrganization, verifyNamespacePrefix);
break;
case openorgs_dedup: //generates organization entities and relations for openorgs dedup
case openorgs_dedup: // generates organization entities and relations for openorgs dedup
log.info("Processing Openorgs...");
smdbe
.execute(
"queryOpenOrgsForOrgsDedup.sql", smdbe::processOrganization, verifyNamespacePrefix);
log.info("Processing Openorgs Merge Rels...");
log.info("Processing Openorgs Sim Rels...");
smdbe.execute("queryOpenOrgsSimilarityForOrgsDedup.sql", smdbe::processOrgOrgSimRels);
break;
case openorgs: //generates organization entities and relations for provision
case openorgs: // generates organization entities and relations for provision
log.info("Processing Openorgs For Provision...");
smdbe
.execute(
"queryOpenOrgsForProvision.sql", smdbe::processOrganization, verifyNamespacePrefix);
log.info("Processing Openorgs Merge Rels...");
smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgSimRels);
//TODO cambiare il mapping delle relazioni in modo che crei merges e isMergedIn
// TODO (specifico per questo caso, questa funzione di mapping verrà usata così com'è nel caso di openorgs dedup
smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgMergeRels);
break;
case openaire_organizations:
@ -635,6 +611,41 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
}
}
public List<Oaf> processOrgOrgMergeRels(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs); // TODO
final String orgId1 = createOpenaireId(20, rs.getString("id1"), true);
final String orgId2 = createOpenaireId(20, rs.getString("id2"), true);
final List<KeyValue> collectedFrom = listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType(ORG_ORG_RELTYPE);
r1.setSubRelType(ModelConstants.DEDUP);
r1.setRelClass(MERGES);
r1.setSource(orgId1);
r1.setTarget(orgId2);
r1.setCollectedfrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
final Relation r2 = new Relation();
r2.setRelType(ORG_ORG_RELTYPE);
r2.setSubRelType(ModelConstants.DEDUP);
r2.setRelClass(IS_MERGED_IN);
r2.setSource(orgId2);
r2.setTarget(orgId1);
r2.setCollectedfrom(collectedFrom);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(r1, r2);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public List<Oaf> processOrgOrgSimRels(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs); // TODO
@ -647,7 +658,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType(ModelConstants.ORG_ORG_RELTYPE);
r1.setRelType(ORG_ORG_RELTYPE);
r1.setSubRelType(ModelConstants.DEDUP);
r1.setRelClass(relClass);
r1.setSource(orgId1);

View File

@ -60,20 +60,22 @@ SELECT
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid,
null AS eclegalbody,
null AS eclegalperson,
null AS ecnonprofit,
null AS ecresearchorganization,
null AS echighereducation,
null AS ecinternationalorganizationeurinterests,
null AS ecinternationalorganization,
null AS ecenterprise,
null AS ecsmevalidated,
null AS ecnutscode
(array_remove(array_cat(ARRAY[o.ec_legalbody], array_agg(od.ec_legalbody)), NULL))[1] AS eclegalbody,
(array_remove(array_cat(ARRAY[o.ec_legalperson], array_agg(od.ec_legalperson)), NULL))[1] AS eclegalperson,
(array_remove(array_cat(ARRAY[o.ec_nonprofit], array_agg(od.ec_nonprofit)), NULL))[1] AS ecnonprofit,
(array_remove(array_cat(ARRAY[o.ec_researchorganization], array_agg(od.ec_researchorganization)), NULL))[1] AS ecresearchorganization,
(array_remove(array_cat(ARRAY[o.ec_highereducation], array_agg(od.ec_highereducation)), NULL))[1] AS echighereducation,
(array_remove(array_cat(ARRAY[o.ec_internationalorganizationeurinterests], array_agg(od.ec_internationalorganizationeurinterests)), NULL))[1] AS ecinternationalorganizationeurinterests,
(array_remove(array_cat(ARRAY[o.ec_internationalorganization], array_agg(od.ec_internationalorganization)), NULL))[1] AS ecinternationalorganization,
(array_remove(array_cat(ARRAY[o.ec_enterprise], array_agg(od.ec_enterprise)), NULL))[1] AS ecenterprise,
(array_remove(array_cat(ARRAY[o.ec_smevalidated], array_agg(od.ec_smevalidated)), NULL))[1] AS ecsmevalidated,
(array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode
FROM other_names n
LEFT OUTER JOIN organizations o ON (n.id = o.id)
LEFT OUTER JOIN urls u ON (u.id = o.id)
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
LEFT OUTER JOIN oa_duplicates d ON (o.id = d.local_id)
LEFT OUTER JOIN organizations od ON (d.oa_original_id = od.id)
WHERE
o.status = 'approved'
GROUP BY

View File

@ -15,22 +15,25 @@ SELECT
'OpenOrgs Database' AS collectedfromname,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid,
null AS eclegalbody,
null AS eclegalperson,
null AS ecnonprofit,
null AS ecresearchorganization,
null AS echighereducation,
null AS ecinternationalorganizationeurinterests,
null AS ecinternationalorganization,
null AS ecenterprise,
null AS ecsmevalidated,
null AS ecnutscode
array_remove(array_cat(array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types'), array_agg(DISTINCT idup.otherid || '###' || idup.type || '@@@dnet:pid_types')), NULL) AS pid,
(array_remove(array_cat(ARRAY[o.ec_legalbody], array_agg(od.ec_legalbody)), NULL))[1] AS eclegalbody,
(array_remove(array_cat(ARRAY[o.ec_legalperson], array_agg(od.ec_legalperson)), NULL))[1] AS eclegalperson,
(array_remove(array_cat(ARRAY[o.ec_nonprofit], array_agg(od.ec_nonprofit)), NULL))[1] AS ecnonprofit,
(array_remove(array_cat(ARRAY[o.ec_researchorganization], array_agg(od.ec_researchorganization)), NULL))[1] AS ecresearchorganization,
(array_remove(array_cat(ARRAY[o.ec_highereducation], array_agg(od.ec_highereducation)), NULL))[1] AS echighereducation,
(array_remove(array_cat(ARRAY[o.ec_internationalorganizationeurinterests], array_agg(od.ec_internationalorganizationeurinterests)), NULL))[1] AS ecinternationalorganizationeurinterests,
(array_remove(array_cat(ARRAY[o.ec_internationalorganization], array_agg(od.ec_internationalorganization)), NULL))[1] AS ecinternationalorganization,
(array_remove(array_cat(ARRAY[o.ec_enterprise], array_agg(od.ec_enterprise)), NULL))[1] AS ecenterprise,
(array_remove(array_cat(ARRAY[o.ec_smevalidated], array_agg(od.ec_smevalidated)), NULL))[1] AS ecsmevalidated,
(array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode
FROM organizations o
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
LEFT OUTER JOIN urls u ON (u.id = o.id)
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
LEFT OUTER JOIN other_names n ON (n.id = o.id)
LEFT OUTER JOIN oa_duplicates d ON (o.id = d.local_id AND d.reltype != 'is_different')
LEFT OUTER JOIN organizations od ON (d.oa_original_id = od.id)
LEFT OUTER JOIN other_ids idup ON (od.id = idup.id)
WHERE
o.status = 'approved'
GROUP BY

View File

@ -7,6 +7,5 @@ SELECT
false AS inferred,
false AS deletedbyinference,
0.99 AS trust,
'' AS inferenceprovenance,
'isSimilarTo' AS relclass
'' AS inferenceprovenance
FROM oa_duplicates WHERE reltype = 'is_similar' OR reltype = 'suggested';

View File

@ -51,6 +51,3 @@ GROUP BY
d.id,
d.officialname,
o.country;
-- TODO modificare in modo da fare il merge dei pid di tutti i record mergiati (per gli openorgs, approvati)
-- TODO invece per tutti gli altri con dei duplicati non fare questa cosa