From 5130eac24747237213a80ce552c891128adf6366 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 24 Jun 2022 17:16:42 +0200 Subject: [PATCH 01/32] mapping by participant project contribution --- .../dhp/schema/oaf/utils/OafMapperUtils.java | 58 +++++++ .../raw/AbstractMdRecordToOafMapper.java | 52 ++---- .../raw/MigrateDbEntitiesApplication.java | 155 +++++------------- .../oa/graph/sql/queryProjectOrganization.sql | 1 + 4 files changed, 114 insertions(+), 152 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java index ac9cfe3300..6f452e846f 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java @@ -419,4 +419,62 @@ public class OafMapperUtils { m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo))); return m; } + + public static Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final OafEntity entity) { + return getRelation(source, target, relType, subRelType, relClass, entity, null); + } + + public static Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final OafEntity entity, + final String validationDate) { + return getRelation( + source, target, relType, subRelType, relClass, entity.getCollectedfrom(), entity.getDataInfo(), + entity.getLastupdatetimestamp(), validationDate, null); + } + + public static Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final List collectedfrom, + final DataInfo dataInfo, + final Long lastupdatetimestamp) { + return getRelation( + source, target, relType, subRelType, relClass, collectedfrom, 
dataInfo, lastupdatetimestamp, null, null); + } + + public static Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final List collectedfrom, + final DataInfo dataInfo, + final Long lastupdatetimestamp, + final String validationDate, + final List properties) { + final Relation rel = new Relation(); + rel.setRelType(relType); + rel.setSubRelType(subRelType); + rel.setRelClass(relClass); + rel.setSource(source); + rel.setTarget(target); + rel.setCollectedfrom(collectedfrom); + rel.setDataInfo(dataInfo); + rel.setLastupdatetimestamp(lastupdatetimestamp); + rel.setValidated(StringUtils.isNotBlank(validationDate)); + rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null); + rel.setProperties(properties); + return rel; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 739be3df42..846440a696 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -267,10 +267,13 @@ public abstract class AbstractMdRecordToOafMapper { res .add( - getRelation( - docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate)); + OafMapperUtils + .getRelation( + docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate)); res - .add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate)); + .add( + OafMapperUtils + .getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate)); } } @@ -303,13 +306,16 @@ public abstract class AbstractMdRecordToOafMapper { final String targetId = 
createOpenaireId(targetType, target, true); rels .add( - getRelation( - entity.getId(), targetId, relType, subRelType, relClass, entity, validationdDate)); + OafMapperUtils + .getRelation( + entity.getId(), targetId, relType, subRelType, relClass, entity, + validationdDate)); rels .add( - getRelation( - targetId, entity.getId(), relType, subRelType, relClassInverse, entity, - validationdDate)); + OafMapperUtils + .getRelation( + targetId, entity.getId(), relType, subRelType, relClassInverse, entity, + validationdDate)); } } } @@ -317,36 +323,6 @@ public abstract class AbstractMdRecordToOafMapper { return rels; } - protected Relation getRelation(final String source, - final String target, - final String relType, - final String subRelType, - final String relClass, - final OafEntity entity) { - return getRelation(source, target, relType, subRelType, relClass, entity, null); - } - - protected Relation getRelation(final String source, - final String target, - final String relType, - final String subRelType, - final String relClass, - final OafEntity entity, - final String validationDate) { - final Relation rel = new Relation(); - rel.setRelType(relType); - rel.setSubRelType(subRelType); - rel.setRelClass(relClass); - rel.setSource(source); - rel.setTarget(target); - rel.setCollectedfrom(entity.getCollectedfrom()); - rel.setDataInfo(entity.getDataInfo()); - rel.setLastupdatetimestamp(entity.getLastupdatetimestamp()); - rel.setValidated(StringUtils.isNotBlank(validationDate)); - rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? 
validationDate : null); - return rel; - } - protected abstract List addOtherResultRels( final Document doc, final OafEntity entity); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 63fa8b7e02..8296e99cd2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -1,32 +1,7 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION; -import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS; -import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION; -import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTICIPANT; -import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_MERGED_IN; -import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT; -import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY; -import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY; -import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO; -import static eu.dnetlib.dhp.schema.common.ModelConstants.MERGES; -import static eu.dnetlib.dhp.schema.common.ModelConstants.ORG_ORG_RELTYPE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME; -import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION; -import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES; -import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION; -import 
static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES; -import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION; -import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET; -import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP; -import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; -import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT; -import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE; -import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM; +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import java.io.Closeable; @@ -45,6 +20,8 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.collect.Lists; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; @@ -68,6 +45,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.utils.ISLookupClientFactory; public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable { @@ -437,25 +415,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final List collectedFrom = listKeyValues( createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); - final Relation r1 = new Relation(); - r1.setRelType(DATASOURCE_ORGANIZATION); - r1.setSubRelType(PROVISION); - r1.setRelClass(IS_PROVIDED_BY); - r1.setSource(dsId); - r1.setTarget(orgId); - 
r1.setCollectedfrom(collectedFrom); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); + final Relation r1 = OafMapperUtils + .getRelation( + dsId, orgId, DATASOURCE_ORGANIZATION, PRODUCES, IS_PROVIDED_BY, collectedFrom, info, + lastUpdateTimestamp); - final Relation r2 = new Relation(); - r2.setRelType(DATASOURCE_ORGANIZATION); - r2.setSubRelType(PROVISION); - r2.setRelClass(PROVIDES); - r2.setSource(orgId); - r2.setTarget(dsId); - r2.setCollectedfrom(collectedFrom); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); + final Relation r2 = OafMapperUtils + .getRelation( + orgId, dsId, DATASOURCE_ORGANIZATION, PRODUCES, PROVIDES, collectedFrom, info, lastUpdateTimestamp); return Arrays.asList(r1, r2); } catch (final Exception e) { @@ -471,25 +438,20 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final List collectedFrom = listKeyValues( createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); - final Relation r1 = new Relation(); - r1.setRelType(PROJECT_ORGANIZATION); - r1.setSubRelType(PARTICIPATION); - r1.setRelClass(HAS_PARTICIPANT); - r1.setSource(projectId); - r1.setTarget(orgId); - r1.setCollectedfrom(collectedFrom); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); + final List properties = Lists + .newArrayList( + keyValue("contribution", String.valueOf(rs.getDouble("totalcost"))), + keyValue("currency", rs.getString("currency"))); - final Relation r2 = new Relation(); - r2.setRelType(PROJECT_ORGANIZATION); - r2.setSubRelType(PARTICIPATION); - r2.setRelClass(IS_PARTICIPANT); - r2.setSource(orgId); - r2.setTarget(projectId); - r2.setCollectedfrom(collectedFrom); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); + final Relation r1 = OafMapperUtils + .getRelation( + projectId, orgId, PROJECT_ORGANIZATION, PARTICIPATION, HAS_PARTICIPANT, collectedFrom, info, + lastUpdateTimestamp, null, 
properties); + + final Relation r2 = OafMapperUtils + .getRelation( + orgId, projectId, PROJECT_ORGANIZATION, PARTICIPATION, IS_PARTICIPANT, collectedFrom, info, + lastUpdateTimestamp, null, properties); return Arrays.asList(r1, r2); } catch (final Exception e) { @@ -703,25 +665,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final List collectedFrom = listKeyValues( createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); - final Relation r1 = new Relation(); - r1.setRelType(ORG_ORG_RELTYPE); - r1.setSubRelType(ModelConstants.DEDUP); - r1.setRelClass(MERGES); - r1.setSource(orgId1); - r1.setTarget(orgId2); - r1.setCollectedfrom(collectedFrom); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); + final Relation r1 = OafMapperUtils + .getRelation(orgId1, orgId2, ORG_ORG_RELTYPE, DEDUP, MERGES, collectedFrom, info, lastUpdateTimestamp); - final Relation r2 = new Relation(); - r2.setRelType(ORG_ORG_RELTYPE); - r2.setSubRelType(ModelConstants.DEDUP); - r2.setRelClass(IS_MERGED_IN); - r2.setSource(orgId2); - r2.setTarget(orgId1); - r2.setCollectedfrom(collectedFrom); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); + final Relation r2 = OafMapperUtils + .getRelation( + orgId2, orgId1, ORG_ORG_RELTYPE, DEDUP, IS_MERGED_IN, collectedFrom, info, lastUpdateTimestamp); return Arrays.asList(r1, r2); } catch (final Exception e) { throw new RuntimeException(e); @@ -738,17 +687,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final List collectedFrom = listKeyValues( createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); - final Relation r = new Relation(); - r.setRelType(ORG_ORG_RELTYPE); - r.setSubRelType(ModelConstants.RELATIONSHIP); - r.setRelClass(rs.getString("type")); - r.setSource(orgId1); - r.setTarget(orgId2); - r.setCollectedfrom(collectedFrom); - 
r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r); + return Arrays + .asList( + OafMapperUtils + .getRelation( + orgId1, orgId2, ORG_ORG_RELTYPE, RELATIONSHIP, rs.getString("type"), collectedFrom, info, + lastUpdateTimestamp)); } catch (final Exception e) { throw new RuntimeException(e); } @@ -765,29 +709,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final List collectedFrom = listKeyValues( createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); - final Relation r1 = new Relation(); - r1.setRelType(ORG_ORG_RELTYPE); - r1.setSubRelType(ModelConstants.DEDUP); - r1.setRelClass(relClass); - r1.setSource(orgId1); - r1.setTarget(orgId2); - r1.setCollectedfrom(collectedFrom); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - // removed because there's no difference between two sides //TODO - // final Relation r2 = new Relation(); - // r2.setRelType(ORG_ORG_RELTYPE); - // r2.setSubRelType(ORG_ORG_SUBRELTYPE); - // r2.setRelClass(relClass); - // r2.setSource(orgId2); - // r2.setTarget(orgId1); - // r2.setCollectedfrom(collectedFrom); - // r2.setDataInfo(info); - // r2.setLastupdatetimestamp(lastUpdateTimestamp); - // return Arrays.asList(r1, r2); - - return Arrays.asList(r1); + return Arrays + .asList( + OafMapperUtils + .getRelation( + orgId1, orgId2, ORG_ORG_RELTYPE, DEDUP, relClass, collectedFrom, info, + lastUpdateTimestamp)); } catch (final Exception e) { throw new RuntimeException(e); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryProjectOrganization.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryProjectOrganization.sql index d9a77427db..9a5133e4bd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryProjectOrganization.sql +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryProjectOrganization.sql @@ -3,6 +3,7 @@ SELECT po.resporganization AS resporganization, po.participantnumber AS participantnumber, po.contribution AS contribution, + po.currency AS currency, NULL AS startdate, NULL AS enddate, false AS inferred, From 0a4f4d98fa8f73d296ef92f4dbf9f12a760ef735 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 13 Jul 2022 15:27:17 +0200 Subject: [PATCH 02/32] added PMCId to PmArticle --- .../dnetlib/dhp/sx/bio/pubmed/PMArticle.java | 24 +- .../dnetlib/dhp/sx/bio/pubmed/PMParser.scala | 1 + .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 48 +++- .../eu/dnetlib/dhp/sx/graph/bio/pubmed.xml | 257 ++++++++++++++++-- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 71 ++++- 5 files changed, 371 insertions(+), 30 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java index af0d5169d3..9287a8cdd6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java @@ -17,6 +17,9 @@ public class PMArticle implements Serializable { * the Pubmed Identifier */ private String pmid; + + private String pmcId; + /** * the DOI */ @@ -122,7 +125,7 @@ public class PMArticle implements Serializable { /** * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. - * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. + * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. * The NLM journal title abbreviation is exported in the element. 
* * @return the pubmed Journal Extracted @@ -140,10 +143,11 @@ public class PMArticle implements Serializable { } /** - * English-language abstracts are taken directly from the published article. - * If the article does not have a published abstract, the National Library of Medicine does not create one, - * thus the record lacks the and elements. However, in the absence of a formally - * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + * contains the entire title of the journal article. is always in English; + * those titles originally published in a non-English language and translated for are enclosed in square brackets. + * All titles end with a period unless another punctuation mark such as a question mark or bracket is present. + * Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl). + * Corporate/collective authors may appear at the end of for citations up to about the year 2000. 
* * @return the extracted pubmed Title */ @@ -250,4 +254,14 @@ public class PMArticle implements Serializable { public List getGrants() { return grants; } + + + public String getPmcId() { + return pmcId; + } + + public PMArticle setPmcId(String pmcId) { + this.pmcId = pmcId; + return this; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala index 49a2716415..9102c12c43 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala @@ -98,6 +98,7 @@ class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] { case "PMID" => currentArticle.setPmid(text.trim) case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim) + if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim) case "Language" => currentArticle.setLanguage(text.trim) case "ISSN" => currentJournal.setIssn(text.trim) case "GrantID" => currentGrant.setGrantID(text.trim) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 92ad22c573..24a1fa62b9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -4,9 +4,12 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf._ -import collection.JavaConverters._ +import eu.dnetlib.dhp.utils.DHPUtils +import 
org.apache.commons.lang3.StringUtils +import collection.JavaConverters._ import java.util.regex.Pattern +import scala.collection.mutable.ListBuffer /** */ @@ -14,6 +17,9 @@ object PubMedToOaf { val SUBJ_CLASS = "keywords" + val OAI_HEADER = "oai:pubmedcentral.nih.gov:" + val OLD_PMC_PREFIX = "od_______267::" + val urlMap = Map( "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/", "doi" -> "https://dx.doi.org/" @@ -50,6 +56,17 @@ object PubMedToOaf { null } + + def createOriginalOpenaireId(article:PMArticle) :String = { + if (StringUtils.isNotEmpty(article.getPmcId)) { + val md5 = DHPUtils.md5(s"$OAI_HEADER${article.getPmcId.replace("PMC","")}") + s"$OLD_PMC_PREFIX$md5" + } + else + null + + } + /** Create an instance of class extends Result * starting from OAF instanceType value * @@ -122,8 +139,9 @@ object PubMedToOaf { return null // MAP PMID into pid with classid = classname = pmid - val pidList: List[StructuredProperty] = List( - OafMapperUtils.structuredProperty( + val pidList = ListBuffer[StructuredProperty]() + + pidList += OafMapperUtils.structuredProperty( article.getPmid, PidType.pmid.toString, PidType.pmid.toString, @@ -131,7 +149,19 @@ object PubMedToOaf { ModelConstants.DNET_PID_TYPES, dataInfo ) - ) + + + if (StringUtils.isNotBlank(article.getPmcId)) + { + pidList += OafMapperUtils.structuredProperty( + article.getPmcId, + PidType.pmc.toString, + PidType.pmc.toString, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + dataInfo + ) + } if (pidList == null) return null @@ -186,6 +216,7 @@ object PubMedToOaf { val urlLists: List[String] = pidList .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) .filter(t => t._1.nonEmpty) + .toList .map(t => t._1 + t._2) if (urlLists != null) pubmedInstance.setUrl(urlLists.asJava) @@ -262,7 +293,14 @@ object PubMedToOaf { if (authors != null && authors.nonEmpty) result.setAuthor(authors.asJava) - result.setOriginalId(pidList.map(s => s.getValue).asJava) + + if 
(StringUtils.isNotEmpty(article.getPmcId)) { + val originalIDS = ListBuffer[String]() + originalIDS += createOriginalOpenaireId(article) + pidList.map(s => s.getValue).foreach(p =>originalIDS += p) + result.setOriginalId(originalIDS.asJava) + } else + result.setOriginalId(pidList.map(s => s.getValue).asJava) result.setId(article.getPmid) diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml index 22da07e299..58a73ae5d3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml @@ -195,7 +195,9 @@ Biochemical and biophysical research communications Biochem Biophys Res Commun - Delineation of the intimate details of the backbone conformation of pyridine nucleotide coenzymes in aqueous solution. + Delineation of the intimate details of the backbone conformation of pyridine nucleotide + coenzymes in aqueous solution. + 1173-9 @@ -473,7 +475,9 @@ Biochemical and biophysical research communications Biochem Biophys Res Commun - Effect of chloroquine on cultured fibroblasts: release of lysosomal hydrolases and inhibition of their uptake. + Effect of chloroquine on cultured fibroblasts: release of lysosomal hydrolases and + inhibition of their uptake. + 1338-43 @@ -657,7 +661,8 @@ Biochemical and biophysical research communications Biochem Biophys Res Commun - Atomic models for the polypeptide backbones of myohemerythrin and hemerythrin. + Atomic models for the polypeptide backbones of myohemerythrin and hemerythrin. + 1349-56 @@ -1627,7 +1632,9 @@ Biochemical pharmacology Biochem Pharmacol - Comparison between procaine and isocarboxazid metabolism in vitro by a liver microsomal amidase-esterase. + Comparison between procaine and isocarboxazid metabolism in vitro by a liver microsomal + amidase-esterase. 
+ 1517-21 @@ -2030,7 +2037,9 @@ Biochemical pharmacology Biochem Pharmacol - Radiochemical assay of glutathione S-epoxide transferase and its enhancement by phenobarbital in rat liver in vivo. + Radiochemical assay of glutathione S-epoxide transferase and its enhancement by + phenobarbital in rat liver in vivo. + 1569-72 @@ -2350,7 +2359,9 @@ Biochemical pharmacology Biochem Pharmacol - Identification of adenylate cyclase-coupled beta-adrenergic receptors with radiolabeled beta-adrenergic antagonists. + Identification of adenylate cyclase-coupled beta-adrenergic receptors with radiolabeled + beta-adrenergic antagonists. + 1651-8 @@ -2598,7 +2609,9 @@ Biochemical pharmacology Biochem Pharmacol - The effect of adrenaline and of alpha- and beta-adrenergic blocking agents on ATP concentration and on incorporation of 32Pi into ATP in rat fat cells. + The effect of adrenaline and of alpha- and beta-adrenergic blocking agents on ATP + concentration and on incorporation of 32Pi into ATP in rat fat cells. + 1659-62 @@ -2851,7 +2864,9 @@ Biochemical pharmacology Biochem Pharmacol - Action of propranolol on mitochondrial functions--effects on energized ion fluxes in the presence of valinomycin. + Action of propranolol on mitochondrial functions--effects on energized ion fluxes in the + presence of valinomycin. + 1701-5 @@ -3265,7 +3280,8 @@ EC 2.6.1.16 - Glutamine-Fructose-6-Phosphate Transaminase (Isomerizing) + Glutamine-Fructose-6-Phosphate Transaminase (Isomerizing) + EC 2.7.- @@ -3324,7 +3340,9 @@ Glucosamine - Glutamine-Fructose-6-Phosphate Transaminase (Isomerizing) + Glutamine-Fructose-6-Phosphate Transaminase + (Isomerizing) + metabolism @@ -3463,7 +3481,8 @@ Biochemical pharmacology Biochem Pharmacol - Inhibition of aldehyde reductase by acidic metabolites of the biogenic amines. + Inhibition of aldehyde reductase by acidic metabolites of the biogenic amines. 
+ 1731-3 @@ -3696,7 +3715,9 @@ Biochemical pharmacology Biochem Pharmacol - Effects of 5,6-dihydroxytryptamine on tyrosine-hydroxylase activity in central catecholaminergic neurons of the rat. + Effects of 5,6-dihydroxytryptamine on tyrosine-hydroxylase activity in central + catecholaminergic neurons of the rat. + 1739-42 @@ -4602,12 +4623,19 @@ Arzneimittel-Forschung Arzneimittelforschung - [Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)]. + [Biochemical studies on camomile components/III. In vitro studies about the antipeptic + activity of (--)-alpha-bisabolol (author's transl)]. + 1352-4 - (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost. + (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not + caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 + percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol + only occurs in case of direct contact. In case of a previous contact with the substrate, the + inhibiting effect is lost. + @@ -4626,7 +4654,9 @@ English Abstract Journal Article - Biochemische Untersuchungen von Kamilleninhaltsstoffen. III. In-vitro-Versuche über die antipeptische Wirkung des (-)-alpha-Bisabolols + Biochemische Untersuchungen von Kamilleninhaltsstoffen. III. 
In-vitro-Versuche über die + antipeptische Wirkung des (-)-alpha-Bisabolols + Germany @@ -4753,12 +4783,37 @@ Arzneimittel-Forschung Arzneimittelforschung - [Demonstration of tumor inhibiting properties of a strongly immunostimulating low-molecular weight substance. Comparative studies with ifosfamide on the immuno-labile DS carcinosarcoma. Stimulation of the autoimmune activity for approx. 20 days by BA 1, a N-(2-cyanoethylene)-urea. Novel prophylactic possibilities]. + [Demonstration of tumor inhibiting properties of a strongly immunostimulating + low-molecular weight substance. Comparative studies with ifosfamide on the immuno-labile DS + carcinosarcoma. Stimulation of the autoimmune activity for approx. 20 days by BA 1, a + N-(2-cyanoethylene)-urea. Novel prophylactic possibilities]. + 1369-79 - A report is given on the recent discovery of outstanding immunological properties in BA 1 [N-(2-cyanoethylene)-urea] having a (low) molecular mass M = 111.104. Experiments in 214 DS carcinosarcoma bearing Wistar rats have shown that BA 1, at a dosage of only about 12 percent LD50 (150 mg kg) and negligible lethality (1.7 percent), results in a recovery rate of 40 percent without hyperglycemia and, in one test, of 80 percent with hyperglycemia. Under otherwise unchanged conditions the reference substance ifosfamide (IF) -- a further development of cyclophosphamide -- applied without hyperglycemia in its most efficient dosage of 47 percent LD50 (150 mg kg) brought about a recovery rate of 25 percent at a lethality of 18 percent. (Contrary to BA 1, 250-min hyperglycemia caused no further improvement of the recovery rate.) However this comparison is characterized by the fact that both substances exhibit two quite different (complementary) mechanisms of action. 
Leucocyte counts made after application of the said cancerostatics and dosages have shown a pronounced stimulation with BA 1 and with ifosfamide, the known suppression in the post-therapeutic interval usually found with standard cancerostatics. In combination with the cited plaque test for BA 1, blood pictures then allow conclusions on the immunity status. Since IF can be taken as one of the most efficient cancerostatics--there is no other chemotherapeutic known up to now that has a more significant effect on the DS carcinosarcoma in rats -- these findings are of special importance. Finally, the total amount of leucocytes and lymphocytes as well as their time behaviour was determined from the blood picture of tumour-free rats after i.v. application of BA 1. The thus obtained numerical values clearly show that further research work on the prophylactic use of this substance seems to be necessary and very promising. + A report is given on the recent discovery of outstanding immunological properties in + BA 1 [N-(2-cyanoethylene)-urea] having a (low) molecular mass M = 111.104. Experiments in 214 DS + carcinosarcoma bearing Wistar rats have shown that BA 1, at a dosage of only about 12 percent + LD50 (150 mg kg) and negligible lethality (1.7 percent), results in a recovery rate of 40 + percent without hyperglycemia and, in one test, of 80 percent with hyperglycemia. Under + otherwise unchanged conditions the reference substance ifosfamide (IF) -- a further development + of cyclophosphamide -- applied without hyperglycemia in its most efficient dosage of 47 percent + LD50 (150 mg kg) brought about a recovery rate of 25 percent at a lethality of 18 percent. + (Contrary to BA 1, 250-min hyperglycemia caused no further improvement of the recovery rate.) + However this comparison is characterized by the fact that both substances exhibit two quite + different (complementary) mechanisms of action. 
Leucocyte counts made after application of the + said cancerostatics and dosages have shown a pronounced stimulation with BA 1 and with + ifosfamide, the known suppression in the post-therapeutic interval usually found with standard + cancerostatics. In combination with the cited plaque test for BA 1, blood pictures then allow + conclusions on the immunity status. Since IF can be taken as one of the most efficient + cancerostatics--there is no other chemotherapeutic known up to now that has a more significant + effect on the DS carcinosarcoma in rats -- these findings are of special importance. Finally, + the total amount of leucocytes and lymphocytes as well as their time behaviour was determined + from the blood picture of tumour-free rats after i.v. application of BA 1. The thus obtained + numerical values clearly show that further research work on the prophylactic use of this + substance seems to be necessary and very promising. + @@ -4778,7 +4833,11 @@ English Abstract Journal Article - Nachweis krebshemmender Eigenschaften einer stark immunstimulierenden Verbindung kleiner Molekülmasse. Versuche am immunlabilen DS-Karzinosarkom im Vergleich mit Ifosfamid. Stimulierung der körpereigenen Abwehr über etwa 20 Tage durch BA 1, einen N-(2-Cyanthylen)-harnstoff. Neue prophylaktische Möglichkeiten + Nachweis krebshemmender Eigenschaften einer stark immunstimulierenden Verbindung + kleiner Molekülmasse. Versuche am immunlabilen DS-Karzinosarkom im Vergleich mit Ifosfamid. + Stimulierung der körpereigenen Abwehr über etwa 20 Tage durch BA 1, einen + N-(2-Cyanthylen)-harnstoff. Neue prophylaktische Möglichkeiten + Germany @@ -5016,7 +5075,20 @@ 1400-3 - The distribution of blood flow to the subendocardial, medium and subepicardial layers of the left ventricular free wall was studied in anaesthetized dogs under normoxic (A), hypoxic (B) conditions and under pharmacologically induced (etafenone) coronary vasodilation (C). 
Regional myocardial blood flow was determined by means of the particle distribution method. In normoxia a transmural gradient of flow was observed, with the subendocardial layers receiving a significantly higher flow rate compared with the subepicardial layers. In hypoxia induced vasodilation this transmural gradient of flow was persistent. In contrast a marked redistribution of regional flow was observed under pharmacologically induced vasodilation. The transmural gradient decreased. In contrast to some findings these experiments demonstrate that a considerable vasodilatory capacity exists in all layers of the myocardium and can be utilized by drugs. The differences observed for the intramural distribution pattern of flow under hypoxia and drug induced vasodilation support the hypothesis that this pattern reflects corresponding gradients of regional myocardial metabolism. + The distribution of blood flow to the subendocardial, medium and subepicardial layers + of the left ventricular free wall was studied in anaesthetized dogs under normoxic (A), hypoxic + (B) conditions and under pharmacologically induced (etafenone) coronary vasodilation (C). + Regional myocardial blood flow was determined by means of the particle distribution method. In + normoxia a transmural gradient of flow was observed, with the subendocardial layers receiving a + significantly higher flow rate compared with the subepicardial layers. In hypoxia induced + vasodilation this transmural gradient of flow was persistent. In contrast a marked + redistribution of regional flow was observed under pharmacologically induced vasodilation. The + transmural gradient decreased. In contrast to some findings these experiments demonstrate that a + considerable vasodilatory capacity exists in all layers of the myocardium and can be utilized by + drugs. 
The differences observed for the intramural distribution pattern of flow under hypoxia + and drug induced vasodilation support the hypothesis that this pattern reflects corresponding + gradients of regional myocardial metabolism. + @@ -5185,4 +5257,151 @@ + + + 4917185 + + 1970 + 10 + 27 + + + 2018 + 11 + 13 + +
+ + 0003-6919 + + 19 + 6 + + 1970 + Jun + + + Applied microbiology + Appl Microbiol + + Bactericidal activity of a broad-spectrum illumination source. + + 1013-4 + + + + Several hours of exposure to Vita-Lite lamps, which have a unique spectral + distribution, give significant killing of cells of Staphylococcus aureus. + + + + + Himmelfarb + P + P + + + Scott + A + A + + + Thayer + P S + PS + + + eng + + Journal Article + +
+ + United States + Appl Microbiol + 7605802 + 0003-6919 + + IM + + + Bacteriological Techniques + instrumentation + + + Light + + + Radiation Effects + + + Serratia marcescens + growth & development + radiation effects + + + Staphylococcus + growth & development + radiation effects + + + Sterilization + + +
+ + + + 1970 + 6 + 1 + + + 1970 + 6 + 1 + 0 + 1 + + + 1970 + 6 + 1 + 0 + 0 + + + ppublish + + 4917185 + PMC376844 + + + + Photochem Photobiol. 1969 Jan;9(1):99-102 + + 4889809 + + + + Endocrinology. 1969 Dec;85(6):1218-21 + + 5347623 + + + + Arch Mikrobiol. 1956;24(1):60-79 + + 13327987 + + + + J Bacteriol. 1941 Sep;42(3):353-66 + + 16560457 + + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index ea742a04aa..37a5808ec9 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -2,7 +2,8 @@ package eu.dnetlib.dhp.sx.bio import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest -import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} +import eu.dnetlib.dhp.schema.oaf.utils.PidType +import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result} import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf} import org.json4s.DefaultFormats @@ -16,6 +17,7 @@ import org.mockito.junit.jupiter.MockitoExtension import java.io.{BufferedReader, InputStream, InputStreamReader} import java.util.zip.GZIPInputStream import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer import scala.io.Source import scala.xml.pull.XMLEventReader @@ -72,6 +74,73 @@ class BioScholixTest extends AbstractVocabularyTest { ) println(mapper.writeValueAsString(r.head)) + } + + + private def checkPMArticle(article:PMArticle): Unit = { + assertNotNull(article.getPmid) + assertNotNull(article.getTitle) + assertNotNull(article.getAuthors) + article.getAuthors.asScala.foreach{a => + assertNotNull(a) + assertNotNull(a.getFullName) + } + + } + + @Test + def testParsingPubmedXML():Unit = { + val xml = new XMLEventReader(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))) + val parser = new PMParser(xml) + parser.foreach(checkPMArticle) + } + + + private def checkPubmedPublication(o:Oaf): Unit = { + 
assertTrue(o.isInstanceOf[Publication]) + val p:Publication = o.asInstanceOf[Publication] + assertNotNull(p.getId) + assertNotNull(p.getTitle) + p.getTitle.asScala.foreach(t =>assertNotNull(t.getValue)) + p.getAuthor.asScala.foreach(a =>assertNotNull(a.getFullname)) + assertNotNull(p.getInstance()) + p.getInstance().asScala.foreach { i => + assertNotNull(i.getCollectedfrom) + assertNotNull(i.getPid) + assertNotNull(i.getInstancetype) + } + assertNotNull(p.getOriginalId) + p.getOriginalId.asScala.foreach(oId => assertNotNull(oId)) + + + val hasPMC = p.getInstance().asScala.exists(i => i.getPid.asScala.exists(pid => pid.getQualifier.getClassid.equalsIgnoreCase(PidType.pmc.toString))) + + + + if (hasPMC) { + assertTrue(p.getOriginalId.asScala.exists(oId => oId.startsWith("od_______267::"))) + + + } + + + + } + @Test + def testPubmedMapping() :Unit = { + + val xml = new XMLEventReader(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))) + val parser = new PMParser(xml) + val results = ListBuffer[Oaf]() + parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies)) + + + + + results.foreach(checkPubmedPublication) + + + } @Test From 00168303dbb4e14ee453a911d3c072314e686864 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 14 Jul 2022 10:19:59 +0200 Subject: [PATCH 03/32] Added unit test to verify the generation in the OriginalID the old openaire Identifier generated by OAI --- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index 37a5808ec9..b021e5e078 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -5,7 
+5,7 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.schema.oaf.utils.PidType import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result} import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf} +import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf} import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse @@ -119,13 +119,42 @@ class BioScholixTest extends AbstractVocabularyTest { if (hasPMC) { assertTrue(p.getOriginalId.asScala.exists(oId => oId.startsWith("od_______267::"))) - - } - - - } + + + @Test + def testPubmedOriginalID():Unit = { + val article:PMArticle = new PMArticle + + + article.setPmid("1234") + + article.setTitle("a Title") + + // VERIFY PUBLICATION IS NOT NULL + article.getPublicationTypes.add( new PMSubject("article",null, null)) + var publication = PubMedToOaf.convert(article, vocabularies).asInstanceOf[Publication] + assertNotNull(publication) + assertEquals("50|pmid________::81dc9bdb52d04dc20036dbd8313ed055", publication.getId) + + // VERIFY PUBLICATION ID DOES NOT CHANGE ALSO IF SETTING PMC IDENTIFIER + article.setPmcId("PMC1517292") + publication = PubMedToOaf.convert(article, vocabularies).asInstanceOf[Publication] + assertNotNull(publication) + assertEquals("50|pmid________::81dc9bdb52d04dc20036dbd8313ed055", publication.getId) + + // VERIFY ORIGINAL ID GENERATE IN OLD WAY USING PMC IDENTIFIER EXISTS + + + val oldOpenaireID ="od_______267::0000072375bc0e68fa09d4e6b7658248" + + val hasOldOpenAIREID = publication.getOriginalId.asScala.exists(o => o.equalsIgnoreCase(oldOpenaireID)) + + assertTrue(hasOldOpenAIREID) + } + + @Test def testPubmedMapping() :Unit = { From 5b76321d9cb58f68a52ee01d14ea77f38e81d749 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 20 Jul 2022 16:34:32 +0200 Subject: [PATCH 04/32] implemented oozie 
workflow to generate scholix dump filtering relclass semantic --- .../sx/graph/convert_dataset_json_params.json | 1 + .../sx/graph/convert_object_json_params.json | 4 +- .../dumpScholix/oozie_app/config-default.xml | 10 ++ .../graph/dumpScholix/oozie_app/workflow.xml | 145 ++++++++++++++++++ .../sx/graph/SparkConvertObjectToJson.scala | 33 +++- .../sx/graph/SparkConvertRDDtoDataset.scala | 61 +++++--- 6 files changed, 228 insertions(+), 26 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json index 8bfdde5b0d..f3e8cdbade 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json @@ -2,4 +2,5 @@ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true} + {"paramName":"r", "paramLongName":"filterRelation", "paramDescription": "the relation to filter", "paramRequired": false} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json index 890570a0bd..cbb20bfe77 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_object_json_params.json @@ -3,5 +3,7 @@ {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"su", "paramLongName":"scholixUpdatePath", "paramDescription": "the scholix updated Path", "paramRequired": false}, {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true}, - {"paramName":"o", "paramLongName":"objectType", "paramDescription": "should be scholix or Summary", "paramRequired": true} + {"paramName":"o", "paramLongName":"objectType", "paramDescription": "should be scholix or Summary", "paramRequired": true}, + {"paramName":"mp", "paramLongName":"maxPidNumberFilter", "paramDescription": "filter max number of pids in source/target", "paramRequired": false} + ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/config-default.xml new file mode 100644 index 0000000000..6fb2a1253c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/config-default.xml @@ -0,0 +1,10 @@ + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/workflow.xml new file mode 100644 index 0000000000..a37d85ad41 --- /dev/null +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/dumpScholix/oozie_app/workflow.xml @@ -0,0 +1,145 @@ + + + + sourcePath + the working dir base path + + + targetPath + the final graph path + + + relationFilter + Filter relation semantic + + + maxNumberOfPid + filter relation with at least #maxNumberOfPid + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + Import JSONRDD to Dataset kryo + eu.dnetlib.dhp.sx.graph.SparkConvertRDDtoDataset + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=3000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --masteryarn + --sourcePath${sourcePath} + --targetPath${targetPath} + --filterRelation${relationFilter} + + + + + + + + + yarn + cluster + Convert Entities to summaries + eu.dnetlib.dhp.sx.graph.SparkCreateSummaryObject + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=20000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --masteryarn + --sourcePath${targetPath}/entities + --targetPath${targetPath}/provision/summaries + + + + + + + + yarn + cluster + Generate Scholix Dataset + eu.dnetlib.dhp.sx.graph.SparkCreateScholix + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} 
+ --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=30000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --masteryarn + --summaryPath${targetPath}/provision/summaries + --targetPath${targetPath}/provision/scholix + --relationPath${targetPath}/relation + + + + + + + + + + + + + + + + + yarn + cluster + Serialize scholix to JSON + eu.dnetlib.dhp.sx.graph.SparkConvertObjectToJson + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=6000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --masteryarn + --sourcePath${targetPath}/provision/scholix/scholix + --targetPath${targetPath}/json/scholix_json + --objectTypescholix + --maxPidNumberFiltermaxNumberOfPid + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala index bfa07eb69e..6695ebd3c8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala @@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import 
eu.dnetlib.dhp.schema.sx.scholix.Scholix import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary +import eu.dnetlib.dhp.sx.graph.SparkConvertObjectToJson.toInt import org.apache.commons.io.IOUtils import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf @@ -12,6 +13,14 @@ import org.slf4j.{Logger, LoggerFactory} object SparkConvertObjectToJson { + def toInt(s: String): Option[Int] = { + try { + Some(s.toInt) + } catch { + case e: Exception => None + } + } + def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) val conf: SparkConf = new SparkConf() @@ -37,6 +46,8 @@ object SparkConvertObjectToJson { log.info(s"objectType -> $objectType") val scholixUpdatePath = parser.get("scholixUpdatePath") log.info(s"scholixUpdatePath -> $scholixUpdatePath") + val maxPidNumberFilter = parser.get("maxPidNumberFilter") + log.info(s"maxPidNumberFilter -> $maxPidNumberFilter") implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] @@ -47,12 +58,22 @@ object SparkConvertObjectToJson { case "scholix" => log.info("Serialize Scholix") val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix] - val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix] - d.union(u) - .repartition(8000) - .map(s => mapper.writeValueAsString(s))(Encoders.STRING) - .rdd - .saveAsTextFile(targetPath, classOf[GzipCodec]) +// val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix] + if (maxPidNumberFilter != null && toInt(maxPidNumberFilter).isDefined) { + val mp = toInt(maxPidNumberFilter).get + d + .filter(s => (s.getSource.getIdentifier.size() <= mp) && (s.getTarget.getIdentifier.size() <= mp)) + .map(s => mapper.writeValueAsString(s))(Encoders.STRING) + .rdd + .saveAsTextFile(targetPath, classOf[GzipCodec]) + } else { + d + .repartition(8000) + .map(s => 
mapper.writeValueAsString(s))(Encoders.STRING) + .rdd + .saveAsTextFile(targetPath, classOf[GzipCodec]) + } + case "summary" => log.info("Serialize Summary") val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary] diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index bd970a5cf4..f72f8dd160 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -4,9 +4,11 @@ import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} import org.apache.commons.io.IOUtils +import org.apache.commons.lang3.StringUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} + import scala.collection.JavaConverters._ object SparkConvertRDDtoDataset { @@ -34,6 +36,9 @@ object SparkConvertRDDtoDataset { val t = parser.get("targetPath") log.info(s"targetPath -> $t") + val filterRelation = parser.get("filterRelation") + log.info(s"filterRelation -> $filterRelation") + val entityPath = s"$t/entities" val relPath = s"$t/relation" val mapper = new ObjectMapper() @@ -94,28 +99,46 @@ object SparkConvertRDDtoDataset { log.info("Converting Relation") - val relationSemanticFilter = List( -// "cites", -// "iscitedby", - "merges", - "ismergedin", - "HasAmongTopNSimilarDocuments", - "IsAmongTopNSimilarDocuments" - ) + if (filterRelation != null && StringUtils.isNoneBlank(filterRelation)) { - val rddRelation = spark.sparkContext - .textFile(s"$sourcePath/relation") - .map(s => 
mapper.readValue(s, classOf[Relation])) - .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) - .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) - //filter OpenCitations relations - .filter(r => - r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k => - "opencitations".equalsIgnoreCase(k.getValue) + val rddRelation = spark.sparkContext + .textFile(s"$sourcePath/relation") + .map(s => mapper.readValue(s, classOf[Relation])) + .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) + .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) + //filter OpenCitations relations + .filter(r => + r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k => + "opencitations".equalsIgnoreCase(k.getValue) + ) ) + .filter(r => r.getRelClass != null && r.getRelClass.equalsIgnoreCase(filterRelation)) + spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") + } else { + + val relationSemanticFilter = List( + // "cites", + // "iscitedby", + "merges", + "ismergedin", + "HasAmongTopNSimilarDocuments", + "IsAmongTopNSimilarDocuments" ) - .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) - spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") + + val rddRelation = spark.sparkContext + .textFile(s"$sourcePath/relation") + .map(s => mapper.readValue(s, classOf[Relation])) + .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) + .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) + //filter OpenCitations relations + .filter(r => + r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k => + "opencitations".equalsIgnoreCase(k.getValue) + ) + ) + .filter(r => 
!relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) + spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") + } } } From 438abdf96fcdc340dfdbe18fa51c56c8a3ce5657 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 20 Jul 2022 18:07:54 +0200 Subject: [PATCH 05/32] [EOSC TAG] adding eosc interoperability guidelines in the specific element in the result. Removed from subjects. Removed also the deletion of EOSC Jupyter Notebook from subject since now the criteria are searchd for in a different place --- .../eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 82 ++++++------------- pom.xml | 2 +- 2 files changed, 27 insertions(+), 57 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index b9de5dd111..d319340812 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -28,28 +28,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final Qualifier EOSC_QUALIFIER = OafMapperUtils - .qualifier( - "EOSC", - "European Open Science Cloud", - ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES); - public static final DataInfo EOSC_DATAINFO = OafMapperUtils - .dataInfo( - false, "propagation", true, false, - OafMapperUtils - .qualifier( - "propagation:subject", "Inferred by OpenAIRE", - ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), - "0.9"); - public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils - .structuredProperty( - "EOSC::Jupyter Notebook", EOSC_QUALIFIER, 
EOSC_DATAINFO); - public final static StructuredProperty EOSC_GALAXY = OafMapperUtils - .structuredProperty( - "EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO); - public final static StructuredProperty EOSC_TWITTER = OafMapperUtils - .structuredProperty( - "EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO); + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -84,29 +63,30 @@ public class SparkEoscTag { }); } + public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics){ + EoscIfGuidelines eig = new EoscIfGuidelines(); + eig.setCode( code); + eig.setLabel(label); + eig.setUrl(url); + eig.setSemanticRelation(semantics); + return eig; + + } private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { readPath(spark, inputPath + "/software", Software.class) .map((MapFunction) s -> { - List sbject; - if (!Optional.ofNullable(s.getSubject()).isPresent()) - s.setSubject(new ArrayList<>()); - sbject = s.getSubject(); if (containsCriteriaNotebook(s)) { - sbject.add(EOSC_NOTEBOOK); - if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) { - sbject = sbject.stream().map(sb -> { - if (sb.getValue().equals("EOSC Jupyter Notebook")) { - return null; - } - return sb; - }).filter(Objects::nonNull).collect(Collectors.toList()); - s.setSubject(sbject); - } + if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) + s.setEoscifguidelines(new ArrayList<>()); + s.getEoscifguidelines().add(newInstance("EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith")); } if (containsCriteriaGalaxy(s)) { - sbject.add(EOSC_GALAXY); + if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) + s.setEoscifguidelines(new ArrayList<>()); + + s.getEoscifguidelines().add(newInstance("EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith")); } return s; }, Encoders.bean(Software.class)) @@ -124,14 +104,14 @@ 
public class SparkEoscTag { readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class) .map((MapFunction) orp -> { List sbject; - if (!Optional.ofNullable(orp.getSubject()).isPresent()) - orp.setSubject(new ArrayList<>()); - sbject = orp.getSubject(); + if (!Optional.ofNullable(orp.getEoscifguidelines()).isPresent()) + orp.setEoscifguidelines(new ArrayList<>()); + if (containsCriteriaGalaxy(orp)) { - sbject.add(EOSC_GALAXY); + orp.getEoscifguidelines().add(newInstance("EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith")); } if (containscriteriaTwitter(orp)) { - sbject.add(EOSC_TWITTER); + orp.getEoscifguidelines().add(newInstance("EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith")); } return orp; }, Encoders.bean(OtherResearchProduct.class)) @@ -149,11 +129,10 @@ public class SparkEoscTag { readPath(spark, inputPath + "/dataset", Dataset.class) .map((MapFunction) d -> { List sbject; - if (!Optional.ofNullable(d.getSubject()).isPresent()) - d.setSubject(new ArrayList<>()); - sbject = d.getSubject(); + if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) + d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - sbject.add(EOSC_TWITTER); + d.getEoscifguidelines().add(newInstance("EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith")); } return d; }, Encoders.bean(Dataset.class)) @@ -212,13 +191,6 @@ public class SparkEoscTag { return false; } - private static Set getSubjects(List s) { - Set subjects = new HashSet<>(); - s.stream().forEach(sbj -> subjects.addAll(Arrays.asList(sbj.getValue().toLowerCase().split(" ")))); - s.stream().forEach(sbj -> subjects.add(sbj.getValue().toLowerCase())); - return subjects; - } - private static Set getWordsSP(List elem) { Set words = new HashSet<>(); Optional @@ -242,9 +214,7 @@ public class SparkEoscTag { t -> words .addAll( Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))))); -// elem -// .forEach( -// t -> 
words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" ")))); + return words; } diff --git a/pom.xml b/pom.xml index 54070f654c..973bc37732 100644 --- a/pom.xml +++ b/pom.xml @@ -801,7 +801,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.12.0] + [2.12.2-SNAPSHOT] [4.0.3] [6.0.5] [3.1.6] From 5f651f231650d0e2547bfdf9f7969611e5d43d85 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 21 Jul 2022 10:11:48 +0200 Subject: [PATCH 06/32] changed filter relation on SubRelType --- .../eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index f72f8dd160..0073afff5e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -112,13 +112,11 @@ object SparkConvertRDDtoDataset { "opencitations".equalsIgnoreCase(k.getValue) ) ) - .filter(r => r.getRelClass != null && r.getRelClass.equalsIgnoreCase(filterRelation)) + .filter(r => r.getSubRelType != null && r.getSubRelType.equalsIgnoreCase(filterRelation)) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") } else { val relationSemanticFilter = List( - // "cites", - // "iscitedby", "merges", "ismergedin", "HasAmongTopNSimilarDocuments", From 5143a802320b3a64a16b09f298f005d4fbce78e9 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 21 Jul 2022 11:56:51 +0200 Subject: [PATCH 07/32] [EOSC TAG] modification of test class to align with new element --- .../dnetlib/dhp/bulktag/EOSCTagJobTest.java | 174 ++++++++++++++++-- 1 file changed, 155 insertions(+), 19 deletions(-) diff --git 
a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index 1ea2541576..b1c0cbb84a 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -126,8 +126,9 @@ public class EOSCTagJobTest { .assertEquals( 4, tmp + .filter(s -> s.getEoscifguidelines()!= null) .filter( - s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) .count()); Assertions @@ -136,17 +137,36 @@ public class EOSCTagJobTest { .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() .get(0) - .getSubject() + .getEoscifguidelines() .size()); + + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .size()); Assertions .assertTrue( tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() .get(0) - .getSubject() + .getEoscifguidelines() .stream() - .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + + Assertions + .assertFalse( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .stream() + .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -166,16 +186,23 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertTrue(tmp + .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) + .collect() + .get(0) + 
.getEoscifguidelines() == null + ); + Assertions .assertEquals( - 9, tmp + 8, tmp .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .collect() @@ -183,6 +210,23 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -201,17 +245,24 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) + .collect() + .get(0) + .getEoscifguidelines() == null + ); Assertions .assertEquals( - 9, tmp + 8, tmp .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .collect() @@ -219,14 +270,27 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertEquals(1, + tmp + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions.assertTrue(tmp + .filter(sw -> 
sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); List subjects = tmp .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) .collect() .get(0) .getSubject(); - Assertions.assertEquals(8, subjects.size()); - Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertEquals(7, subjects.size()); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("Modeling and Simulation"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("structure granulaire"))); @@ -250,6 +314,14 @@ public class EOSCTagJobTest { .filter( ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); + Assertions + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) + .filter( + ds -> ds.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + .count()); Assertions .assertEquals( @@ -264,9 +336,18 @@ public class EOSCTagJobTest { .textFile(workingDir.toString() + "/input/otherresearchproduct") .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .filter( - ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + orp -> orp.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); + Assertions + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/otherresearchproduct") + .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) + .filter( + orp -> orp.getSubject().stream().anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter 
Notebook"))) + .count()); + // spark.stop(); } @@ -326,22 +407,37 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 1, + 0, tmp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines()!=null) + .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines()!=null) + .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( - 2, tmp + 1, tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() @@ -350,6 +446,19 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); + Assertions.assertEquals(1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .size() ); + Assertions.assertTrue(tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); + Assertions .assertEquals( 5, tmp @@ -385,22 +494,26 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 1, + 0, orp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); + orp.foreach(o-> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); + + Assertions.assertEquals(1, orp.filter(o -> o.getEoscifguidelines() != null) + .filter(o -> o.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))).count()); Assertions .assertEquals( - 3, orp + 2, orp 
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( orp .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .collect() @@ -408,6 +521,23 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); + Assertions + .assertEquals( + 1, orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -516,10 +646,16 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 3, + 0, orp .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .count()); + Assertions + .assertEquals( + 3, + orp + .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .count()); JavaRDD dats = sc .textFile(workingDir.toString() + "/input/dataset") @@ -531,7 +667,7 @@ public class EOSCTagJobTest { .assertEquals( 3, dats - .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) + .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) .count()); } From 56d09e6348d63d531250ea6f075c1d306c07ed66 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 21 Jul 2022 14:36:48 +0200 Subject: [PATCH 08/32] [EOSC TAG] before adding the tag added a step to verify the same tag is not already present --- .../eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index d319340812..c5ed0b45ce 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -80,13 +80,14 @@ public class SparkEoscTag { if (containsCriteriaNotebook(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - s.getEoscifguidelines().add(newInstance("EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith")); + addEIG(s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith"); + } if (containsCriteriaGalaxy(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - s.getEoscifguidelines().add(newInstance("EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith")); + addEIG(s.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); } return s; }, Encoders.bean(Software.class)) @@ -103,15 +104,15 @@ public class SparkEoscTag { readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class) .map((MapFunction) orp -> { - List sbject; + if (!Optional.ofNullable(orp.getEoscifguidelines()).isPresent()) orp.setEoscifguidelines(new ArrayList<>()); if (containsCriteriaGalaxy(orp)) { - orp.getEoscifguidelines().add(newInstance("EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith")); + addEIG(orp.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); } if (containscriteriaTwitter(orp)) { - orp.getEoscifguidelines().add(newInstance("EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith")); + addEIG(orp.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } 
return orp; }, Encoders.bean(OtherResearchProduct.class)) @@ -128,11 +129,11 @@ public class SparkEoscTag { readPath(spark, inputPath + "/dataset", Dataset.class) .map((MapFunction) d -> { - List sbject; + if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - d.getEoscifguidelines().add(newInstance("EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith")); + addEIG(d.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return d; }, Encoders.bean(Dataset.class)) @@ -148,6 +149,12 @@ public class SparkEoscTag { .json(inputPath + "/dataset"); } + private static void addEIG(List eoscifguidelines, String code, String label, String url, String sem) { + if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code))) + eoscifguidelines.add(newInstance(code, label, url, sem)); + } + + private static boolean containscriteriaTwitter(Result r) { Set words = getWordsSP(r.getTitle()); words.addAll(getWordsF(r.getDescription())); From 3be036f290f3cb569465cf9566557e51c4660288 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 21 Jul 2022 14:45:43 +0200 Subject: [PATCH 09/32] [EOSC TAG] refactoring after compilation --- .../eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 25 +- .../dnetlib/dhp/bulktag/EOSCTagJobTest.java | 284 ++++++++++-------- 2 files changed, 178 insertions(+), 131 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index c5ed0b45ce..730e8a3fe7 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -29,7 +29,6 @@ public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); public static final 
ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( @@ -63,15 +62,16 @@ public class SparkEoscTag { }); } - public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics){ + public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics) { EoscIfGuidelines eig = new EoscIfGuidelines(); - eig.setCode( code); + eig.setCode(code); eig.setLabel(label); eig.setUrl(url); eig.setSemanticRelation(semantics); return eig; } + private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { readPath(spark, inputPath + "/software", Software.class) @@ -80,14 +80,17 @@ public class SparkEoscTag { if (containsCriteriaNotebook(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - addEIG(s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith"); + addEIG( + s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", + "compliesWith"); } if (containsCriteriaGalaxy(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - addEIG(s.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); + addEIG( + s.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); } return s; }, Encoders.bean(Software.class)) @@ -109,10 +112,12 @@ public class SparkEoscTag { orp.setEoscifguidelines(new ArrayList<>()); if (containsCriteriaGalaxy(orp)) { - addEIG(orp.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); + addEIG( + orp.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", + "compliesWith"); } if (containscriteriaTwitter(orp)) { - addEIG(orp.getEoscifguidelines(),"EOSC::Twitter Data", 
"EOSC::Twitter Data", "", "compliesWith"); + addEIG(orp.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return orp; }, Encoders.bean(OtherResearchProduct.class)) @@ -133,7 +138,7 @@ public class SparkEoscTag { if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - addEIG(d.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); + addEIG(d.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return d; }, Encoders.bean(Dataset.class)) @@ -149,12 +154,12 @@ public class SparkEoscTag { .json(inputPath + "/dataset"); } - private static void addEIG(List eoscifguidelines, String code, String label, String url, String sem) { + private static void addEIG(List eoscifguidelines, String code, String label, String url, + String sem) { if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code))) eoscifguidelines.add(newInstance(code, label, url, sem)); } - private static boolean containscriteriaTwitter(Result r) { Set words = getWordsSP(r.getTitle()); words.addAll(getWordsF(r.getDescription())); diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index b1c0cbb84a..5f47da10e5 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -126,9 +126,12 @@ public class EOSCTagJobTest { .assertEquals( 4, tmp - .filter(s -> s.getEoscifguidelines()!= null) + .filter(s -> s.getEoscifguidelines() != null) .filter( - s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter 
Notebook"))) .count()); Assertions @@ -141,13 +144,13 @@ public class EOSCTagJobTest { .size()); Assertions - .assertEquals( - 1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getSubject() - .size()); + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .size()); Assertions .assertTrue( tmp @@ -159,14 +162,14 @@ public class EOSCTagJobTest { .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions - .assertFalse( - tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getSubject() - .stream() - .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + .assertFalse( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .stream() + .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -186,12 +189,13 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); - Assertions.assertTrue(tmp - .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) - .collect() - .get(0) - .getEoscifguidelines() == null - ); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) + .collect() + .get(0) + .getEoscifguidelines() == null); Assertions .assertEquals( @@ -211,22 +215,22 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); Assertions - .assertEquals( - 1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) - .collect() - .get(0) - .getEoscifguidelines() - .size()); + .assertEquals( + 1, tmp + .filter(sw -> 
sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); Assertions - .assertTrue( - tmp - .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) - .collect() - .get(0) - .getEoscifguidelines() - .stream() - .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -245,13 +249,13 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); - Assertions.assertTrue( + Assertions + .assertTrue( tmp - .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) - .collect() - .get(0) - .getEoscifguidelines() == null - ); + .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) + .collect() + .get(0) + .getEoscifguidelines() == null); Assertions .assertEquals( @@ -270,20 +274,24 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); - Assertions.assertEquals(1, + Assertions + .assertEquals( + 1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) - .collect() - .get(0) - .getEoscifguidelines() - .size()); - Assertions.assertTrue(tmp - .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) - .collect() - .get(0) - .getEoscifguidelines() - .stream() - .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> 
sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); List subjects = tmp .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) @@ -315,13 +323,16 @@ public class EOSCTagJobTest { ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); Assertions - .assertEquals( - 0, sc - .textFile(workingDir.toString() + "/input/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) - .filter( - ds -> ds.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) - .count()); + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) + .filter( + ds -> ds + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + .count()); Assertions .assertEquals( @@ -336,17 +347,23 @@ public class EOSCTagJobTest { .textFile(workingDir.toString() + "/input/otherresearchproduct") .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .filter( - orp -> orp.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + orp -> orp + .getSubject() + .stream() + .anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); Assertions - .assertEquals( - 0, sc - .textFile(workingDir.toString() + "/input/otherresearchproduct") - .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) - .filter( - orp -> orp.getSubject().stream().anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) - .count()); + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/otherresearchproduct") + .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) + .filter( + orp -> orp + .getSubject() + .stream() + 
.anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) + .count()); // spark.stop(); } @@ -413,20 +430,24 @@ public class EOSCTagJobTest { s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); Assertions - .assertEquals( - 1, - tmp - .filter( - s -> s.getEoscifguidelines()!=null) - .count()); + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .count()); Assertions - .assertEquals( - 1, - tmp - .filter( - s -> s.getEoscifguidelines()!=null) - .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) - .count()); + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( @@ -446,18 +467,23 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); - Assertions.assertEquals(1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getEoscifguidelines() - .size() ); - Assertions.assertTrue(tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getEoscifguidelines() - .stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -499,10 +525,18 @@ public class EOSCTagJobTest { .filter( s -> 
s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); - orp.foreach(o-> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); + orp.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); - Assertions.assertEquals(1, orp.filter(o -> o.getEoscifguidelines() != null) - .filter(o -> o.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))).count()); + Assertions + .assertEquals( + 1, orp + .filter(o -> o.getEoscifguidelines() != null) + .filter( + o -> o + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( @@ -522,22 +556,22 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); Assertions - .assertEquals( - 1, orp - .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) - .collect() - .get(0) - .getEoscifguidelines() - .size()); + .assertEquals( + 1, orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); Assertions - .assertTrue( - orp - .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) - .collect() - .get(0) - .getEoscifguidelines() - .stream() - .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); + .assertTrue( + orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -651,11 +685,15 @@ public class EOSCTagJobTest { .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .count()); Assertions - .assertEquals( - 3, - orp - .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) 
- .count()); + .assertEquals( + 3, + orp + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .count()); JavaRDD dats = sc .textFile(workingDir.toString() + "/input/dataset") @@ -667,7 +705,11 @@ public class EOSCTagJobTest { .assertEquals( 3, dats - .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) .count()); } From 317a4a56ef9019bd4cc3280673ec34cc8c5b58d0 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 21 Jul 2022 17:37:48 +0200 Subject: [PATCH 10/32] [EOSC context TAG] first implementation of the logic to tag results imported from datasources registered in the EOSC --- .../dhp/bulktag/eosc/DatasourceMaster.java | 28 ++++ .../eosc/ReadMasterDatasourceFromDB.java | 134 ++++++++++++++++ .../dhp/bulktag/eosc/SparkEoscBulkTag.java | 145 ++++++++++++++++++ pom.xml | 2 +- 4 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/DatasourceMaster.java create mode 100644 dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java create mode 100644 dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/DatasourceMaster.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/DatasourceMaster.java new file mode 100644 index 0000000000..537356586d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/DatasourceMaster.java @@ -0,0 +1,28 @@ +package eu.dnetlib.dhp.bulktag.eosc; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 21/07/22 + */ +public class DatasourceMaster implements Serializable 
{ + private String datasource; + private String master; + + public String getDatasource() { + return datasource; + } + + public void setDatasource(String datasource) { + this.datasource = datasource; + } + + public String getMaster() { + return master; + } + + public void setMaster(String master) { + this.master = master; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java new file mode 100644 index 0000000000..7d6964707d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java @@ -0,0 +1,134 @@ + +package eu.dnetlib.dhp.bulktag.eosc; + +/** + * @author miriam.baglioni + * @Date 21/07/22 + */ + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.DbClient; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.common.RelationInverse; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.BufferedWriter; +import java.io.Closeable; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.List; +import java.util.function.Consumer; +import java.util.function.Function; + +public class ReadMasterDatasourceFromDB implements Closeable { + + private final DbClient 
dbClient; + private static final Log log = LogFactory.getLog(ReadMasterDatasourceFromDB.class); + + private final BufferedWriter writer; + private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static final String QUERY = "SELECT dso.id datasource, d.id master FROM " + + "(SELECT id FROM dsm_services WHERE id like 'eosc%') dso " + + "FULL JOIN " + + "(SELECT id, duplicate FROM dsm_dedup_services WHERE duplicate like 'eosc%')d " + + "ON dso.id = d.duplicate"; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + ReadMasterDatasourceFromDB.class + .getResourceAsStream( + "/eu/dnetlib/dhp/blacklist/blacklist_parameters.json"))); + + parser.parseArgument(args); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + final String hdfsPath = parser.get("hdfsPath") + "/datasourceMasters"; + final String hdfsNameNode = parser.get("hdfsNameNode"); + + try (final ReadMasterDatasourceFromDB rmd = new ReadMasterDatasourceFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser, + dbPassword)) { + + log.info("Processing datasources..."); + rmd.execute(QUERY, rmd::datasourceMasterMap); + + } + } + + public void execute(final String sql, final Function producer) { + + dbClient.processResults(sql, rs -> writeMap(producer.apply(rs))); + } + + public DatasourceMaster datasourceMasterMap(ResultSet rs) { + try { + DatasourceMaster dm = new DatasourceMaster(); + String datasource = rs.getString("datasource"); + dm.setDatasource(datasource); + String master = rs.getString("master"); + if (StringUtils.isNotBlank(master)) + dm.setMaster(OafMapperUtils.createOpenaireId(10, master, true)); + else + dm.setMaster(OafMapperUtils.createOpenaireId(10, datasource, true)); + return dm; + + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } + + 
@Override + public void close() throws IOException { + dbClient.close(); + writer.close(); + } + + public ReadMasterDatasourceFromDB( + final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword) + throws IOException { + + this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); + + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + + FileSystem fileSystem = FileSystem.get(conf); + Path hdfsWritePath = new Path(hdfsPath); + FSDataOutputStream fsDataOutputStream = null; + if (fileSystem.exists(hdfsWritePath)) { + fsDataOutputStream = fileSystem.append(hdfsWritePath); + } else { + fsDataOutputStream = fileSystem.create(hdfsWritePath); + } + + this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); + } + + protected void writeMap(final DatasourceMaster dm) { + try { + writer.write(OBJECT_MAPPER.writeValueAsString(dm)); + writer.newLine(); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java new file mode 100644 index 0000000000..aa709cb293 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java @@ -0,0 +1,145 @@ +package eu.dnetlib.dhp.bulktag.eosc; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.bulktag.SparkBulkTagJob; +import eu.dnetlib.dhp.bulktag.community.*; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.Context; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; 
+import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.print.attribute.DocAttributeSet; +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir; +import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*; +import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.TAGGING_TRUST; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS; + +/** + * @author miriam.baglioni + * @Date 21/07/22 + */ +public class SparkEoscBulkTag implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(SparkEoscBulkTag.class); + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkEoscBulkTag.class + .getResourceAsStream( + "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + String datasourceMapPath = parser.get("datasourceMapPath"); + log.info("datasourceMapPath: {}", 
datasourceMapPath); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + Class resultClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + CommunityConfiguration cc; + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + execBulkTag(spark, inputPath, outputPath, datasourceMapPath, resultClazz); + }); + } + + private static void execBulkTag( + SparkSession spark, + String inputPath, + String outputPath, + String datasourceMapPath, + Class resultClazz) { + + final List hostedByList = Arrays.asList(readPath(spark, datasourceMapPath, DatasourceMaster.class) + .map((MapFunction) dm -> dm.getMaster(), Encoders.STRING()) + .collect()); + + readPath(spark, inputPath, resultClazz) + .map(patchResult(), Encoders.bean(resultClazz)) + .filter(Objects::nonNull) + .map( + (MapFunction) value -> enrich(value, hostedByList), + Encoders.bean(resultClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + private static R enrich(R value, List hostedByList) { + if(value.getInstance().stream().anyMatch(i -> hostedByList.contains(i.getHostedby().getKey())) || + (value.getEoscifguidelines() != null && value.getEoscifguidelines().size() > 0)){ + Context context = new Context(); + context.setId("eosc"); + OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE,true,false, + OafMapperUtils.qualifier(CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, + DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST); + value.getContext().add(context); + + } + return value; + + } + + public static Dataset readPath( + SparkSession spark, String inputPath, Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + + // TODO remove this hack as soon as the values fixed by this method will 
be provided as NON null + private static MapFunction patchResult() { + return r -> { + if (r.getDataInfo().getDeletedbyinference() == null) { + r.getDataInfo().setDeletedbyinference(false); + } + if (r.getContext() == null) { + r.setContext(new ArrayList<>()); + } + return r; + }; + } + +} diff --git a/pom.xml b/pom.xml index 54070f654c..821ce3124a 100644 --- a/pom.xml +++ b/pom.xml @@ -801,7 +801,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.12.0] + [2.12.1] [4.0.3] [6.0.5] [3.1.6] From ddc414b2586075f2ab32b59480115c826e9a863c Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 22 Jul 2022 09:43:15 +0200 Subject: [PATCH 11/32] fixed wrong json param --- .../eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json index f3e8cdbade..3a2f907089 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json @@ -1,6 +1,6 @@ [ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, - {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true} + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true}, {"paramName":"r", "paramLongName":"filterRelation", "paramDescription": "the relation to filter", "paramRequired": false} ] \ No newline at end of file From 7a1c1b6f53b8c7ca51633b440338fffc997e81bb Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" 
Date: Fri, 22 Jul 2022 14:36:02 +0200 Subject: [PATCH 12/32] [EOSC context TAG] Add test class and resourcesK --- .../dhp/bulktag/eosc/DatasourceMaster.java | 29 +- .../eosc/ReadMasterDatasourceFromDB.java | 62 ++-- .../dhp/bulktag/eosc/SparkEoscBulkTag.java | 229 +++++++------ .../bulktag/datasourcemaster_parameters.json | 32 ++ .../input_eosc_bulkTag_parameters.json | 34 ++ .../dhp/bulktag/oozie_app/workflow.xml | 139 +++++++- .../dhp/bulktag/EOSCContextTaggingTest.java | 152 +++++++++ .../dhp/bulktag/eosc/dataset/dataset_10.json | 10 + .../datasourceMaster | 318 ++++++++++++++++++ 9 files changed, 855 insertions(+), 150 deletions(-) create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/datasourcemaster_parameters.json create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eosc_bulkTag_parameters.json create mode 100644 dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCContextTaggingTest.java create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/eosc/dataset/dataset_10.json create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/DatasourceMaster.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/DatasourceMaster.java index 537356586d..ee01597e75 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/DatasourceMaster.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/DatasourceMaster.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.bulktag.eosc; import java.io.Serializable; @@ -7,22 +8,22 @@ import java.io.Serializable; * @Date 21/07/22 */ public class DatasourceMaster implements Serializable { - private String datasource; - private String master; + private String datasource; + private 
String master; - public String getDatasource() { - return datasource; - } + public String getDatasource() { + return datasource; + } - public void setDatasource(String datasource) { - this.datasource = datasource; - } + public void setDatasource(String datasource) { + this.datasource = datasource; + } - public String getMaster() { - return master; - } + public String getMaster() { + return master; + } - public void setMaster(String master) { - this.master = master; - } + public void setMaster(String master) { + this.master = master; + } } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java index 7d6964707d..9ad108749f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java @@ -1,27 +1,6 @@ package eu.dnetlib.dhp.bulktag.eosc; -/** - * @author miriam.baglioni - * @Date 21/07/22 - */ - -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.DbClient; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.common.RelationInverse; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - import java.io.BufferedWriter; import java.io.Closeable; import java.io.IOException; @@ -34,6 +13,28 @@ import java.util.List; import 
java.util.function.Consumer; import java.util.function.Function; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +/** + * @author miriam.baglioni + * @Date 21/07/22 + */ +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.DbClient; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.common.RelationInverse; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; + public class ReadMasterDatasourceFromDB implements Closeable { private final DbClient dbClient; @@ -43,18 +44,18 @@ public class ReadMasterDatasourceFromDB implements Closeable { private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String QUERY = "SELECT dso.id datasource, d.id master FROM " + - "(SELECT id FROM dsm_services WHERE id like 'eosc%') dso " + - "FULL JOIN " + - "(SELECT id, duplicate FROM dsm_dedup_services WHERE duplicate like 'eosc%')d " + - "ON dso.id = d.duplicate"; + "(SELECT id FROM dsm_services WHERE id like 'eosc%') dso " + + "FULL JOIN " + + "(SELECT id, duplicate FROM dsm_dedup_services WHERE duplicate like 'eosc%')d " + + "ON dso.id = d.duplicate"; public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - ReadMasterDatasourceFromDB.class + ReadMasterDatasourceFromDB.class .getResourceAsStream( - "/eu/dnetlib/dhp/blacklist/blacklist_parameters.json"))); + "/eu/dnetlib/dhp/bulktag/datasourcemaster_parameters.json"))); parser.parseArgument(args); @@ -64,8 +65,9 @@ public class 
ReadMasterDatasourceFromDB implements Closeable { final String hdfsPath = parser.get("hdfsPath") + "/datasourceMasters"; final String hdfsNameNode = parser.get("hdfsNameNode"); - try (final ReadMasterDatasourceFromDB rmd = new ReadMasterDatasourceFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser, - dbPassword)) { + try ( + final ReadMasterDatasourceFromDB rmd = new ReadMasterDatasourceFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser, + dbPassword)) { log.info("Processing datasources..."); rmd.execute(QUERY, rmd::datasourceMasterMap); @@ -75,7 +77,7 @@ public class ReadMasterDatasourceFromDB implements Closeable { public void execute(final String sql, final Function producer) { - dbClient.processResults(sql, rs -> writeMap(producer.apply(rs))); + dbClient.processResults(sql, rs -> writeMap(producer.apply(rs))); } public DatasourceMaster datasourceMasterMap(ResultSet rs) { diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java index aa709cb293..600a5cec8e 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java @@ -1,18 +1,22 @@ + package eu.dnetlib.dhp.bulktag.eosc; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.gson.Gson; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.bulktag.SparkBulkTagJob; -import eu.dnetlib.dhp.bulktag.community.*; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.Context; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import static eu.dnetlib.dhp.PropagationConstant.readPath; +import static 
eu.dnetlib.dhp.PropagationConstant.removeOutputDir; +import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*; +import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.TAGGING_TRUST; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +import javax.print.attribute.DocAttributeSet; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.ForeachFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -21,16 +25,15 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.print.attribute.DocAttributeSet; -import java.io.Serializable; -import java.util.*; -import java.util.stream.Collectors; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; -import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir; -import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*; -import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.TAGGING_TRUST; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.bulktag.SparkBulkTagJob; +import eu.dnetlib.dhp.bulktag.community.*; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; /** * @author miriam.baglioni @@ -38,108 +41,124 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTION */ public class SparkEoscBulkTag implements Serializable { - private static final Logger 
log = LoggerFactory.getLogger(SparkEoscBulkTag.class); - public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final Logger log = LoggerFactory.getLogger(SparkEoscBulkTag.class); + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - SparkEoscBulkTag.class - .getResourceAsStream( - "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkEoscBulkTag.class + .getResourceAsStream( + "/eu/dnetlib/dhp/bulktag/input_eosc_bulkTag_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); - String datasourceMapPath = parser.get("datasourceMapPath"); - log.info("datasourceMapPath: {}", datasourceMapPath); + String datasourceMapPath = parser.get("datasourceMapPath"); + log.info("datasourceMapPath: {}", 
datasourceMapPath); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); - Class resultClazz = (Class) Class.forName(resultClassName); + Class resultClazz = (Class) Class.forName(resultClassName); - SparkConf conf = new SparkConf(); - CommunityConfiguration cc; + SparkConf conf = new SparkConf(); + CommunityConfiguration cc; - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - execBulkTag(spark, inputPath, outputPath, datasourceMapPath, resultClazz); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, workingPath); + execBulkTag(spark, inputPath, workingPath, datasourceMapPath, resultClazz); + }); + } - private static void execBulkTag( - SparkSession spark, - String inputPath, - String outputPath, - String datasourceMapPath, - Class resultClazz) { + private static void execBulkTag( + SparkSession spark, + String inputPath, + String workingPath, + String datasourceMapPath, + Class resultClazz) { - final List hostedByList = Arrays.asList(readPath(spark, datasourceMapPath, DatasourceMaster.class) - .map((MapFunction) dm -> dm.getMaster(), Encoders.STRING()) - .collect()); - readPath(spark, inputPath, resultClazz) - .map(patchResult(), Encoders.bean(resultClazz)) - .filter(Objects::nonNull) - .map( - (MapFunction) value -> enrich(value, hostedByList), - Encoders.bean(resultClazz)) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(outputPath); - } + List hostedByList = readPath(spark, datasourceMapPath, DatasourceMaster.class) + .map((MapFunction) dm -> dm.getMaster(), Encoders.STRING()) + .collectAsList(); - private static R enrich(R value, List hostedByList) { - if(value.getInstance().stream().anyMatch(i -> hostedByList.contains(i.getHostedby().getKey())) 
|| - (value.getEoscifguidelines() != null && value.getEoscifguidelines().size() > 0)){ - Context context = new Context(); - context.setId("eosc"); - OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE,true,false, - OafMapperUtils.qualifier(CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, - DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST); - value.getContext().add(context); - } - return value; - } - public static Dataset readPath( - SparkSession spark, String inputPath, Class clazz) { - return spark - .read() - .textFile(inputPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); - } + readPath(spark, inputPath, resultClazz) + .map(patchResult(), Encoders.bean(resultClazz)) + .filter(Objects::nonNull) + .map( + (MapFunction) value -> enrich(value, hostedByList), + Encoders.bean(resultClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(workingPath); - // TODO remove this hack as soon as the values fixed by this method will be provided as NON null - private static MapFunction patchResult() { - return r -> { - if (r.getDataInfo().getDeletedbyinference() == null) { - r.getDataInfo().setDeletedbyinference(false); - } - if (r.getContext() == null) { - r.setContext(new ArrayList<>()); - } - return r; - }; - } + readPath(spark, workingPath, resultClazz) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(inputPath); + + } + + private static R enrich(R value, List hostedByList) { + if (value.getInstance().stream().anyMatch(i -> hostedByList.contains(i.getHostedby().getKey())) || + (value.getEoscifguidelines() != null && value.getEoscifguidelines().size() > 0)) { + Context context = new Context(); + context.setId("eosc"); + OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, + DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST); 
+ value.getContext().add(context); + + } + return value; + + } + + public static Dataset readPath( + SparkSession spark, String inputPath, Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + + // TODO remove this hack as soon as the values fixed by this method will be provided as NON null + private static MapFunction patchResult() { + return r -> { + if (r.getDataInfo().getDeletedbyinference() == null) { + r.getDataInfo().setDeletedbyinference(false); + } + if (r.getContext() == null) { + r.setContext(new ArrayList<>()); + } + return r; + }; + } } diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/datasourcemaster_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/datasourcemaster_parameters.json new file mode 100644 index 0000000000..9a2eadaa7d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/datasourcemaster_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the path where storing the sequential file", + "paramRequired": true + }, + { + "paramName": "nn", + "paramLongName": "hdfsNameNode", + "paramDescription": "the name node on hdfs", + "paramRequired": true + }, + { + "paramName": "pgurl", + "paramLongName": "postgresUrl", + "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", + "paramRequired": true + }, + { + "paramName": "pguser", + "paramLongName": "postgresUser", + "paramDescription": "postgres user", + "paramRequired": false + }, + { + "paramName": "pgpasswd", + "paramLongName": "postgresPassword", + "paramDescription": "postgres password", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eosc_bulkTag_parameters.json 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eosc_bulkTag_parameters.json new file mode 100644 index 0000000000..ebbbd408b5 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eosc_bulkTag_parameters.json @@ -0,0 +1,34 @@ +[ + + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName": "dmp", + "paramLongName":"datasourceMapPath", + "paramDescription": "the path where the association datasource master has been stored", + "paramRequired": true + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName": "wp", + "paramLongName": "workingPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml index f34c5110ff..6016e0179e 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -16,6 +16,21 @@ outputPath the output path + + + + postgresURL + the url of the postgress server to query + + + postgresUser + the username to access the postgres db + + + postgresPassword + the postgres password + + @@ -29,7 +44,7 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -226,10 +241,132 @@ --sourcePath${outputPath} 
--workingPath${workingDir}/eoscTag + + + + + + + eu.dnetlib.dhp.bulktag.eosc.ReadMasterDatasourceFromDB + --hdfsPath${workingDir}/datasourcemaster + --hdfsNameNode${nameNode} + --postgresUrl${postgresURL} + --postgresUser${postgresUser} + --postgresPassword${postgresPassword} + + + + + + + + + + + yarn-cluster + cluster + EOSC_tagging + eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag + dhp-enrichment-${projectVersion}.jar + + --num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${outputPath}/publication + --workingPath${workingDir}/eoscContextTag/publication + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --datasourceMapPath${workingDir}/datasourcemaster + + + + + + + + yarn-cluster + cluster + EOSC_tagging + eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag + dhp-enrichment-${projectVersion}.jar + + --num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${outputPath}/dataset + --workingPath${workingDir}/eoscContextTag/dataset + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --datasourceMapPath${workingDir}/datasourcemaster + + + + + + + yarn-cluster + cluster + EOSC_tagging + eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag + dhp-enrichment-${projectVersion}.jar + + 
--num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${outputPath}/software + --workingPath${workingDir}/eoscContextTag/software + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --datasourceMapPath${workingDir}/datasourcemaster + + + + + + + yarn-cluster + cluster + EOSC_tagging + eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag + dhp-enrichment-${projectVersion}.jar + + --num-executors=${sparkExecutorNumber} + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${outputPath}/otherresearchproduct + --workingPath${workingDir}/eoscContextTag/otherresearchproduct + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --datasourceMapPath${workingDir}/datasourcemaster + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCContextTaggingTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCContextTaggingTest.java new file mode 100644 index 0000000000..d6785acc7d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCContextTaggingTest.java @@ -0,0 +1,152 @@ + +package eu.dnetlib.dhp.bulktag; +/** + * @author miriam.baglioni + * @Date 22/07/22 + */ + +import 
package eu.dnetlib.dhp.bulktag;

/**
 * @author miriam.baglioni
 * @Date 22/07/22
 */

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag;
import eu.dnetlib.dhp.schema.oaf.Dataset;

/**
 * Verifies that {@link SparkEoscBulkTag} adds the "eosc" context to results whose
 * instances are hosted by an EOSC datasource, according to the datasourceMaster
 * association provided via {@code -datasourceMapPath}.
 *
 * Fixture expectations (dataset_10.json, 10 records, 4 of which must be tagged):
 * - 50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea has instance hostedby eosc
 * - 50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1 has instance hostedby eosc
 * - 50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7 has two instances, one hostedby eosc
 * - 50|475c1990cbb2::3894c94123e96df8a21249957cf160cb has EoscTag
 */
public class EOSCContextTaggingTest {

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path workingDir;

	private static final Logger log = LoggerFactory.getLogger(EOSCContextTaggingTest.class);

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(EOSCContextTaggingTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(EOSCContextTaggingTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			// was EOSCTagJobTest.class: use this class's name for a correctly labelled session
			.appName(EOSCContextTaggingTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	@Test
	void EoscContextTagTest() throws Exception {

		// stage the fixture as the job's input
		spark
			.read()
			.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/dataset/dataset_10.json").getPath())
			.map(
				(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
				Encoders.bean(Dataset.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingDir.toString() + "/input/dataset");

		SparkEoscBulkTag
			.main(
				new String[] {
					"-isSparkSessionManaged", Boolean.FALSE.toString(),
					"-sourcePath",
					workingDir.toString() + "/input/dataset",
					"-workingPath", workingDir.toString() + "/working/dataset",
					"-datasourceMapPath",
					getClass()
						.getResource("/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster")
						.getPath(),
					"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		// the job writes the tagged records back to the source path; re-read them from there
		JavaRDD<Dataset> tmp = sc
			.textFile(workingDir.toString() + "/input/dataset")
			.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));

		Assertions.assertEquals(10, tmp.count());

		// exactly the four fixture records listed in the class Javadoc must carry the eosc context
		Assertions
			.assertEquals(
				4,
				tmp
					.filter(s -> s.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
					.count());

		assertTagged(tmp, "50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea");
		assertTagged(tmp, "50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1");
		assertTagged(tmp, "50|475c1990cbb2::3894c94123e96df8a21249957cf160cb");
		// fixed: this assertion previously duplicated the 3894c941... id, leaving
		// 449f28ee... (two instances, one hostedby eosc) untested
		assertTagged(tmp, "50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7");
	}

	/** Asserts that exactly one record with the given id exists and carries the "eosc" context. */
	private static void assertTagged(final JavaRDD<Dataset> tmp, final String id) {
		Assertions
			.assertEquals(
				1,
				tmp
					.filter(
						d -> d.getId().equals(id)
							&& d.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
					.count());
	}

}
Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-05-28"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:37.716Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"The number of people who are unemployed as a percentage of the active labour force (i.e. 
employed and unemployed)."},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Le nombre de personnes sans emploi en pourcentage de la population active (c'est-à-dire les personnes occupées et les chômeurs)."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-05-28"},"hostedby":{"key":"10|eosc________::7ef2576047f040612b983a27347471fc","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://open.canada.ca/data/en/dataset/f212a64f-92f0-430c-a04f-06436b1239d2"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1656599592929,"originalId":["50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea","oai:open.canada.ca:f212a64f-92f0-430c-a04f-06436b1239d2"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Open Data Canada"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2013-05-28"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-05-28"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,
"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"labour"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Labour Force"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Employment"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Unemployment"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject
_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Labour"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Labor"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Industry Labour Force"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Labor Force"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Working Age 
Population"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Unemployment Rate"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Female Unemployment"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Male Unemployment"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Youth 
Unemployment"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Emploi"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Chômage"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Population 
active"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Travail"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Main-d'œuvre"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Industrie Main-d'œuvre"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Population en âge de 
travailler"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Chômage féminin"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Chômage masculin"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Taux de chômage"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Le chômage des 
jeunes"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Unemployment Rate"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"translated title","classname":"translated title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Taux de chômage"}]} +{"author":[{"affiliation":[],"fullname":"Jonard Pérez, Natalia","name":"Natalia","pid":[],"rank":1,"surname":"Jonard Pérez"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Federated Research Data Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de 
recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-07-30"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:46.467Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Non UBC"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Unreviewed"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Author affiliation: 
UNAM"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Faculty"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-07-30"},"hostedby":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"http://creativecommons.org/licenses/by-nc-nd/4.0/"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["http://hdl.handle.net/2429/58601"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1656599600782,"originalId":["oai:open.library.ubc.ca-cIRcle-collections-birs:oai:circle.library.ubc.ca:2429/58601","50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"UBC cIRcle BIRS Workshop Lecture 
Videos"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2016-07-30"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-07-30"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Mathematics"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Convex and discrete 
geometry"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Differential geometry"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Functional analysis"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Group actions on hyperspaces of compact convex subsets of Rn"}]} +{"eoscifguidelines":[{"code":"EOSC::Jupyter Notebook","label":"EOSC::Jupyter Notebook","url":"","semanticRelation":"compliesWith"}],"author":[{"affiliation":[],"fullname":"Beacham, Terry D.","name":"Terry D.","pid":[],"rank":1,"surname":"Beacham"},{"affiliation":[],"fullname":"Araujo, H. Andres","name":"H. 
Andres","pid":[],"rank":2,"surname":"Araujo"},{"affiliation":[],"fullname":"Tucker, Strahan","name":"Strahan","pid":[],"rank":3,"surname":"Tucker"},{"affiliation":[],"fullname":"Trudel, Marc","name":"Marc","pid":[],"rank":4,"surname":"Trudel"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Federated Research Data Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de 
recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2019-06-11"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:47.799Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Size-selective mortality owing to lack of energy reserves during the first marine winter has been suggested to be a result of juvenile salmon failing to reach a critical size or condition by the end of their first marine summer and not surviving the following winter due to this presumed energy deficit. This hypothesis implies a knife-edge mortality function based upon size, and is subject to empirical data support for acceptance. Scale circulus spacing has been interpreted as an index for body size, and we reviewed the effect of size-selective mortality with a knife-edge mortality function on descriptive statistics for a scale circulus spacing index (SCSI). In order to invoke size selection as an important driver of mortality during the first year of ocean rearing, it is necessary to demonstrate not only that size-selective mortality is directed towards the smaller members of the population, but that the selective nature of the mortality can account for a substantial portion of the observed mortality. 
If the assumption is made that a random sample of a single juvenile population has been obtained, then studies that employ a SCSI to infer size-selective mortality coupled with a critical size limit must demonstrate a shift toward larger values of the SCSI, but also a concomitant reduction in the variance and range of the SCSI and an increase in the skewness and kurtosis of the SCSI values. Through simulation we found that the percentage of adults that displayed a SCSI value greater than the maximum observed in the juvenile sample was highly dependent on the initial juvenile sample size and size-selective mortality rate. Geographical distributions of juvenile Pacific salmon can be stratified by size, with larger individuals migrating earlier from local ocean entry locations than smaller individuals, and thus differential timing migration of juveniles based upon body size prior to the collection of the marine juvenile sample may be a more plausible explanation of published trends in the SCSI, rather than invoking substantial size-selective mortality and a critical size limit."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[{"box":"","place":"North Pacific","point":""}],"id":"50|475c1990cbb2::3894c94123e96df8a21249957cf160cb","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.5061/dryad.n560cd7"}],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2019-06-11"},"hostedby":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"https://creativecommons.org/publicdomain/zero/1.0/"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://dx.doi.org/10.5061/dryad.n560cd7","http://dx.doi.org/10.5061/dryad.n560cd7"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1656599625160,"originalId":["oai:datadryad.org:doi:10.5061/dryad.n560cd7","50|475c1990cbb2::3894c94123e96df8a21249957cf160cb"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dryad"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","clas
sname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2019-06-11"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2019-06-11"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Pacific Salmon"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Oncorhynchus"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Size-selective 
mortality"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Data from: Validity of inferring size-selective mortality and a critical size limit in Pacific salmon from scale circulus spacing"}]} +{"author":[{"affiliation":[],"fullname":"Statistics Canada | Statistique Canada","name":"","pid":[],"rank":1,"surname":""}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Federated Research Data Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de 
recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2021-06-22"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:48.54Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"

Canadian Internet use survey, Internet use by age group and household income quartile for Canada, provinces and census metropolitan areas (CMA) from 2010 and 2012.

"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"

Enquête canadienne sur l'utilisation d'Internet, utilisation d'Internet selon le groupe d'âge et le revenu du ménage, pour le Canada, les provinces et les régions métropolitaines de recensement (RMRs), de 2010 et 2012.

"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2021-06-22"},"hostedby":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"http://open.canada.ca/en/open-government-licence-canada"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://open.canada.ca/data/en/dataset/9ec6e76a-6708-47e9-8753-c701f33a79c0"]},{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2021-06-22"},"hostedby":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"http://open.canada.ca/en/open-government-licence-canada"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://open.canada.ca/data/en/dataset/9ec6e76a-6708-47e9-8753-c701f33a79c0"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1657046633905,"measures":[],"originalId":["50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7","oai:open.canada.ca:9ec6e76a-6708-47e9-8753-c701f33a79c0"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Open Data 
Canada"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2021-06-22"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2021-06-22"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"science_and_technology"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"information_and_communications"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"class
id":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"table"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"digital economy and society"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"science and technology"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"digital technology and internet 
use"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"digital technology and internet use by individuals and households"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"tableau"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"économie et société numériques"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"sciences et 
technologie"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"utilisation d'internet et de la technologie numérique"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"utilisation d'internet et de la technologie numérique par les particuliers et les ménages"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_classes","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.8226"},"qualifier":{"classid":"mesh","classname":"Medical Subject Headings","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"education"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"translated title","classname":"translated title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Utilisation d'Internet, selon le groupe d'âge, le revenu du ménage et la géographie, 
inactif"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Internet use by age group, household income and geography, inactive"}]} +{"author":[{"affiliation":[],"fullname":"Natural Resources Canada | Ressources naturelles Canada","name":"","pid":[],"rank":1,"surname":""}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Federated Research Data Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de 
recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2014-09-12"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:48.895Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This data provides the integrated cadastral framework for the specified Canada Land. The cadastral framework consists of active and superseded cadastral parcel, roads, easements, administrative areas, active lines, points and annotations. The cadastral lines form the boundaries of the parcels. COGO attributes are associated to the lines and depict the adjusted framework of the cadastral fabric. The cadastral annotations consist of lot numbers, block numbers, township numbers, etc. The cadastral framework is compiled from Canada Lands Survey Records (CLSR), Registration Plans (RS) and Location Sketches (LS) archived in the Canada Lands Survey Records."},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Ces données fournissent le cadastre intégré pour la Terre du Canada spécifiée. 
Le canevas cadastral contient les parcelles, les routes, servitudes, zone administratives, lignes cadastrales, annotations et les points. Les lignes cadastrales forment les limites des parcelles. Des attributs COGO sont rattachés à ces lignes et montrent le canevas ajusté du morcellement foncier. Les annotations comprennent les numéros de lot, les numéros de bloc, les numéros de township, etc. Le canevas cadastral est compilé à partir des archives d'arpentage des Terres du Canada (CLSR), des plans d'enregistrement (PE) et des croquis de localisation (CL) archivés dans les Archives d'arpentage des Terres du Canada."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[{"box":"","place":"","point":""}],"id":"50|475c1990cbb2::68405431da70bbb14c4c91d178f2d323","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2014-09-12"},"hostedby":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"http://open.canada.ca/en/open-government-licence-canada"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://open.canada.ca/data/en/dataset/b39ef1c2-61b9-4ae3-9df4-2335051199cf"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1656599656458,"originalId":["oai:open.canada.ca:b39ef1c2-61b9-4ae3-9df4-2335051199cf","50|475c1990cbb2::68405431da70bbb14c4c91d178f2d323"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Open Data 
Canada"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2014-09-12"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2014-09-12"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"economics_and_industry"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"form_descriptors"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport
:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"government_and_politics"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"nature_and_environment"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"survey"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"boundaries"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typ
ologies","schemename":"dnet:subject_classification_typologies"},"value":"land management"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"national parks"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"canada lands"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"parcel"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"frontière"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","s
chemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"gestion des terres"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"arpentage"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"parcelle"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"terres du 
canada"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"parc national"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Cadastral Information for Elk Island National Park of Canada"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"translated title","classname":"translated title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Information cadastrale pour le parc national Elk-Island du Canada"}]} +{"author":[{"affiliation":[],"fullname":"Statistics Canada | Statistique Canada","name":"","pid":[],"rank":1,"surname":""}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Federated Research Data Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-05-14"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:49.009Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This table is part of a series of tables that present a portrait of Canada based on the various census topics. The tables range in complexity and levels of geography. 
Content varies from a simple overview of the country to complex cross-tabulations; the tables may also cover several censuses."},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Ce tableau fait partie d'une série de tableaux qui présente un portrait du Canada selon les divers thèmes du recensement. Ces tableaux varient selon la complexité et les niveaux géographiques. Le contenu varie d'un simple aperçu du pays à des tableaux croisés plus complexes; les tableaux peuvent également présenter des données provenant de plusieurs recensements."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|475c1990cbb2::79e6f2bbac44a48f0d09400f77222a9a","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-05-14"},"hostedby":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"http://open.canada.ca/en/open-government-licence-canada"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://open.canada.ca/data/en/dataset/ac25a197-27dd-4270-b8fe-a1edf6399980"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1656599687856,"originalId":["oai:open.canada.ca:ac25a197-27dd-4270-b8fe-a1edf6399980","50|475c1990cbb2::79e6f2bbac44a48f0d09400f77222a9a"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Open Data 
Canada"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2003-05-14"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-05-14"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"persons"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"census of 
population"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"recensement de la population"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Industry - 1997 North American Industry Classification System (23), Occupation - 2001 National Occupational Classification for Statistics (60), Class of Worker (12) and Sex (3) for Labour Force 15 Years and Over, for Canada, Provinces, Territories, Census Metropolitan Areas and Census Agglomerations, 2001 Census - 20% Sample Data"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"translated title","classname":"translated title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Industrie - Système de classification des industries de l'Amérique du Nord de 1997 (23), profession - Classification nationale des professions pour statistiques de 2001 (60), catégorie de travailleurs (12) et sexe (3) pour la population active de 15 ans et plus, pour le Canada, les provinces, les territoires, les régions métropolitaines de recensement et les agglomérations de recensement, 
recensement de 2001 - Données-échantillon (20 %)"}]} +{"author":[{"affiliation":[],"fullname":"Statistics Canada | Statistique Canada","name":"","pid":[],"rank":1,"surname":""}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Federated Research Data Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de 
recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2012-02-08"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:50.943Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Provides a statistical overview of various geographic areas based on a number of detailed variables."},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Fournit un aperçu statistique de régions géographiques variées à partir de plusieurs variables détaillées."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|475c1990cbb2::7e2b404002a9802e61dc2ac8b4d850bf","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2012-02-08"},"hostedby":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"http://open.canada.ca/en/open-government-licence-canada"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://open.canada.ca/data/en/dataset/5a676bc4-710f-4e8d-8c95-1a08869d2362"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1656599685542,"originalId":["50|475c1990cbb2::7e2b404002a9802e61dc2ac8b4d850bf","oai:open.canada.ca:5a676bc4-710f-4e8d-8c95-1a08869d2362"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Open Data 
Canada"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2012-02-08"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2012-02-08"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"persons"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"census of 
population"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"recensement de la population"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Census Profile, 2011 Census (Dissemination Areas)"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"translated title","classname":"translated title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Census Profile, Recensement 2011 (Aires de diffusion)"}]} +{"author":[{"affiliation":[],"fullname":"City of Calgary","name":"","pid":[],"rank":1,"surname":""}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Federated Research Data Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-02-15"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:47.835Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This dataset contains information on yearly updates for driving alone to work, accessibility to transit, access to daily needs & services, community belonging, greenhouse gas emissions, river withdrawals, active adults and urban canopy 
coverage"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|475c1990cbb2::81c9fa2d7250f26b846cbe61edd1a885","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-02-15"},"hostedby":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://data.calgary.ca/d/cd4k-d8nb"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1656599601835,"originalId":["50|475c1990cbb2::81c9fa2d7250f26b846cbe61edd1a885","oai:data.calgary.ca:cd4k-d8nb"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"City of Calgary Open Data 
Portal"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2018-02-15"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-02-15"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Government"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Other QoL Indicators"}]} +{"author":[{"affiliation":[],"fullname":"Natural Resources Canada | Ressources naturelles 
Canada","name":"","pid":[],"rank":1,"surname":""}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Federated Research Data Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2010-12-31"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:31.179Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Many health professionals in Canada consider that obesity has reached or is nearing 
epidemic proportions in this country. Most scientific evidence is supportive of physical activity as a positive determinant. Greater health benefits can be achieved by increasing the amount; duration, frequency, or intensity-of physical activity. Lack of physical activity has negative health consequences. Population in the Atlantic Provinces tends to be more overweight and inactive, at least in relative terms."},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"De nombreux professionnels de la santé considèrent que l'obésité a atteint ou atteindra bientôt des proportions épidémiques au Canada. Selon la plupart des preuves scientifiques, l'activité physique a des effets bénéfiques sur la santé. L'augmentation de l'activité physique (durée, fréquence ou intensité) peut entraîner des bienfaits accrus sur le plan de la santé. Le manque d'activité physique nuit à la santé. 
Les taux de surpoids et d'inactivité sont généralement plus élevés, du moins en termes relatifs, dans les provinces de l'Atlantique."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[{"box":"","place":"","point":""}],"id":"50|475c1990cbb2::827655d9c8b98cba14d7ab81c44f6aba","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2010-12-31"},"hostedby":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de 
recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"http://open.canada.ca/en/open-government-licence-canada"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://open.canada.ca/data/en/dataset/f0c8dade-8893-11e0-aed7-6cf049291510"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1656599587045,"originalId":["oai:open.canada.ca:f0c8dade-8893-11e0-aed7-6cf049291510","50|475c1990cbb2::827655d9c8b98cba14d7ab81c44f6aba"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Open Data 
Canada"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2010-12-31"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2010-12-31"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"health_and_safety"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"map"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:reposit
ory","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"health"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"obesity"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"carte"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"santé"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typol
ogies"},"value":"obésité"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Health Behaviours: Population Who Were Overweight, 1996 to 1997"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"translated title","classname":"translated title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Comportements liés à la santé - Population avec un surpoids en 1996 à 1997"}]} +{"author":[{"affiliation":[],"fullname":"Agriculture and Forestry","name":"","pid":[],"rank":1,"surname":""}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Federated Research Data Repository"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Dépôt fédéré de données de 
recherche"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-07-13"},"dateofcollection":"2022-06-26T00:09:49+0000","dateoftransformation":"2022-06-26T01:48:49.386Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This map displays the risk of soil degradation by wind in the agricultural region of Alberta. Wind erosion is a concern because it reduces soil quality by removing soil nutrients, smaller soil particles and organic matter. Wind erosion can reduce air quality during extreme erosion events and also reduce water quality if eroded particles drift into streams and lakes. The map uses five classes to describe the wind erosion risk on bare, unprotected mineral soil: negligible, low, moderate, high and severe. This resource was created using ArcGIS. 
It was originally published as a print map in 1989."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[{"box":"","place":"","point":""}],"id":"50|475c1990cbb2::a40eb7c51d3c3f3947018b436db8ea55","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-07-13"},"hostedby":{"key":"10|openaire____::3795d6478e30e2c9f787d427ff160944","value":"Federated Research Data Repository / Dépôt fédéré de données de recherche"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://open.alberta.ca/opendata/8d6ff780-11e0-4f8a-96aa-d04eeebae458"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1656599638434,"originalId":["oai:open.alberta.ca:gda-10f9c423-e216-4234-85d8-2005f9d263c9","50|475c1990cbb2::a40eb7c51d3c3f3947018b436db8ea55"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Government of Alberta Open Data 
Portal"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2016-07-13"}],"resourcetype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"storagedate":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-07-13"},"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Environment"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Energy and Natural 
Resources"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Alberta"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"geoscientificInformation"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Downloadable 
Data"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"climatologyMeteorologyAtmosphere"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"soil fertility"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"desertification"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"soil 
movement"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Wind Erosion Risk"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster new file mode 100644 index 0000000000..935b25477d --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster @@ -0,0 +1,318 @@ +{"datasource":"eosc________::100percentit::100percentit.100_percent_it_trusted_cloud","master":"10|eosc________::7ef2576047f040612b983a27347471fc"} +{"datasource":"eosc________::altec::altec.space-vis_adn_service","master":"10|eosc________::2946c48bbcc514ad76bbbf727d5d8fbc"} +{"datasource":"eosc________::astron::astron.","master":"10|eosc________::acb262d4bfdeb6aa9b463a4a6d0d662a"} +{"datasource":"eosc________::athena::athena.atmo-flud","master":"10|eosc________::ac448975e1d7f8b0266c8bb3b3992029"} +{"datasource":"eosc________::athena::athena.uw-map","master":"10|eosc________::5f2a401cf8ce9dc22a3776cea519b594"} +{"datasource":"eosc________::athena::athena.verbal_aggression_analyser_va_analyser","master":"10|eosc________::8b26233e89a50e3754972b1341130494"} +{"datasource":"eosc________::authenix::authenix.authenix","master":"10|eosc________::3cd84764da5728473593a580efb29a40"} +{"datasource":"eosc________::bineo::bineo.cos4bio","master":"10|eosc________::903e0526a6e56eeaf0e4561aa862ecb8"} 
+{"datasource":"eosc________::blue-cloud::blue-cloud.phytoplankton_eovs","master":"10|eosc________::c2438d79b48baf817956f3856877b3b8"} +{"datasource":"eosc________::bsc-es::bsc-es.bdrc_-_barcelona_dust_regional_center","master":"10|eosc________::756664ca614118315840eb8e985e4377"} +{"datasource":"eosc________::bsc-es::bsc-es.openebench","master":"10|eosc________::69ed72b873b803feed5ba6ae47548419"} +{"datasource":"eosc________::capsh::capsh.dissemin","master":"10|eosc________::e81587742e4107ce83723df17c27cb40"} +{"datasource":"eosc________::carlzeissm::carlzeissm.aper","master":"10|eosc________::f3beb9ee5ee293b723e2edd6f990fde3"} +{"datasource":"eosc________::ccsd::ccsd.episciences","master":"10|eosc________::e1e9de0dbf4bce79c49338d7cf9327e2"} +{"datasource":"eosc________::cds::cds.simbad_simbad_astronomical_database_provides_basic_data_cross-identifications_bibliography_and_measurements_for_astronomical_objects_outside_the_solar_system","master":"10|eosc________::a1e41e71453ac32161f4ac3f5c0e0421"} +{"datasource":"eosc________::centerdata::centerdata.surveycodingsorg","master":"10|eosc________::72db73ab253727c889905da50f506d10"} +{"datasource":"eosc________::cesga::cesga.finisterrae","master":"10|eosc________::6af4303d93f72744cc4c3c815ed2c9a0"} +{"datasource":"eosc________::cesnet::cesnet.metacentrum_cloud","master":"10|eosc________::cebfaa2d0b93502d56a8fbeb6b66cfbe"} +{"datasource":"eosc________::cesnet::cesnet.object_based_storage","master":"10|eosc________::1c5b55339bb86ff997a256d42d7be4b0"} +{"datasource":"eosc________::cesnet::cesnet.umsa_-_untargeted_mass_spectrometry_data_analysis","master":"10|eosc________::d928868211759352cb1604713e0347ec"} +{"datasource":"eosc________::cessda-eric::cessda-eric.cessda_data_catalogue","master":"10|fairsharing_::936824c0191953647ec609b4f49bc964"} +{"datasource":"eosc________::cessda-eric::cessda-eric.data_management_expert_guide_dmeg","master":"10|eosc________::22c14aaf31fc64424fa97adffe6380b9"} 
+{"datasource":"eosc________::cessda-eric::cessda-eric.elsst__european_language_social_science_thesaurus","master":"10|eosc________::5b30e057381cf0200dc2cdc7b562f570"} +{"datasource":"eosc________::cines::cines.etdr","master":"10|eosc________::3b7f7d6aafb0154025330183d59ce670"} +{"datasource":"eosc________::clarin-eric::clarin-eric.language_resource_switchboard","master":"10|eosc________::3531aa80dbe2b1018133b510a933de40"} +{"datasource":"eosc________::clarin-eric::clarin-eric.virtual_collection_registry","master":"10|eosc________::454e4f7f9f53d9dacf9dc3ba27902f16"} +{"datasource":"eosc________::clarin-eric::clarin-eric.virtual_language_observatory","master":"10|eosc________::4db0c877190783461728c6714cb66cbc"} +{"datasource":"eosc________::cloudferro::cloudferro.data_collections_catalog","master":"10|eosc________::eba1540eb9e87231fdf366eb23d16c3a"} +{"datasource":"eosc________::cloudferro::cloudferro.data_related_services_-_eo_browser","master":"10|eosc________::c24ebda20485c08293b72561ee3c634b"} +{"datasource":"eosc________::cloudferro::cloudferro.data_related_services_-_eo_finder","master":"10|eosc________::3d68186239b6c0f0d677ff55d9b549d1"} +{"datasource":"eosc________::cloudferro::cloudferro.infrastructure","master":"10|eosc________::ac7e3c0151fa3f11d3a7739dddaa3416"} +{"datasource":"eosc________::cmcc::cmcc.enes_data_space","master":"10|eosc________::2925e4df4147819e5b5d2f886f40e3a2"} +{"datasource":"eosc________::cnb-csic::cnb-csic.3dbionotes-ws_web_application_to_annotate_biochemical_and_biomedical_information_onto_structural_models","master":"10|eosc________::77fe0a66415f2440ab60d47dcee678a5"} +{"datasource":"eosc________::cnb-csic::cnb-csic.scipioncloud","master":"10|eosc________::7f09b7fee99363813f24aca9ebdecf61"} +{"datasource":"eosc________::cnr-iia::cnr-iia.geo_dab","master":"10|eosc________::108b0148352c15ee1ce935699e09add3"} 
+{"datasource":"eosc________::collabwith::collabwith.collabwith_marketplace","master":"10|eosc________::894a0ffa7768b228c1b46793670c85e6"} +{"datasource":"eosc________::coronis_computing_sl::coronis_computing_sl.uw-mos","master":"10|eosc________::9cbf0a75d817e291771b8bce6440f5f4"} +{"datasource":"eosc________::coronis_computing_sl::coronis_computing_sl.vd-maps","master":"10|eosc________::b5af1514b39d8e021554a73076a694d9"} +{"datasource":"eosc________::creaf::creaf.nimmbus_geospatial_user_feedback","master":"10|eosc________::86c325db16448760b3390dda7e46631a"} +{"datasource":"eosc________::creatis::creatis.virtual_imaging_platform","master":"10|eosc________::01a45ac2677f89414af91e651735846d"} +{"datasource":"eosc________::cs_group::cs_group.ai4geo_engine","master":"10|eosc________::c61211295d27e5e08f4c64f3e3098294"} +{"datasource":"eosc________::csc-fi::csc-fi.chipster","master":"10|eosc________::61549f785a2c93939be011b0453a6981"} +{"datasource":"eosc________::csc-fi::csc-fi.cpouta","master":"10|eosc________::d71c843b4e00eff17db07bf9d10769f9"} +{"datasource":"eosc________::csc-fi::csc-fi.csc_epouta","master":"10|eosc________::4493bd6a93e5b8465fda8cf7ab2dfdea"} +{"datasource":"eosc________::csc-fi::csc-fi.rahti_container_cloud","master":"10|eosc________::cc60eb9fc76f9598ee581eff0792573b"} +{"datasource":"eosc________::cscs::cscs.object_storage","master":"10|eosc________::3da6a817fe85ef43f7d97ef07e467d45"} +{"datasource":"eosc________::csi_piemonte::csi_piemonte.nivola2","master":"10|eosc________::ac6483be3e556c8652b8595680795983"} +{"datasource":"eosc________::csic::csic.csic_cloud_infrastructure","master":"10|eosc________::05ea2eb193382e22f32b32fbe9a4d961"} +{"datasource":"eosc________::cyberbotics::cyberbotics.robotbenchmark","master":"10|eosc________::27ee094c68b7a758ca2915aca6215a1d"} +{"datasource":"eosc________::d4science::d4science.alien_and_invasive_species_vre","master":"10|eosc________::b5cff6d55dcf6c20e78a0f1f847b3005"} 
+{"datasource":"eosc________::d4science::d4science.rprototypinglab_virtual_research_environment","master":"10|eosc________::8073ab0dbb22dc3b9f17627a7b25903f"} +{"datasource":"eosc________::d4science::d4science.visual_media_service_vre","master":"10|eosc________::eabf459f53c2bfe6247f006fcc0f4db7"} +{"datasource":"eosc________::dariah_eric::dariah_eric.dariah-campus","master":"10|eosc________::9c63075d6642a2d269776c2b90c2f976"} +{"datasource":"eosc________::dariah_eric::dariah_eric.ssh_open_marketplace","master":"10|eosc________::91fe494a3c21805febb03353152f1212"} +{"datasource":"eosc________::datacite::datacite.datacite_doi_registration_service","master":"10|eosc________::c146a470f01ee7ded3b55acda9362e7f"} +{"datasource":"eosc________::dcc-uk::dcc-uk.dmponline","master":"10|eosc________::fe480090e0739dab86b24a11177eeffd"} +{"datasource":"eosc________::denbi::denbi.cloud","master":"10|eosc________::59399e560967488c0ae0329e0d37f5b4"} +{"datasource":"eosc________::desy::desy.pan_data","master":"10|eosc________::52008fe404bf2e939140109162f9233f"} +{"datasource":"eosc________::desy::desy.pan_faas","master":"10|eosc________::026939c4b12d7d71e2b05bc5acde804e"} +{"datasource":"eosc________::desy::desy.pan_gitlab","master":"10|eosc________::f13cefc9f3207cb82f3285b05f190f78"} +{"datasource":"eosc________::desy::desy.pan_notebook","master":"10|eosc________::500fe61cce6562797cd43797aab12be5"} +{"datasource":"eosc________::digitalglobe::digitalglobe.earthwatch","master":"10|eosc________::020d905260267066c1926f526bb86f30"} +{"datasource":"eosc________::dkrz::dkrz.enes_climate_analytics_service","master":"10|eosc________::1d7a1fea6694d15d9e67f08e1e77082b"} +{"datasource":"eosc________::doabf::doabf.operas_certification","master":"10|eosc________::79b9748edeffb872a28660a9d238dcec"} +{"datasource":"eosc________::ds-wizard::ds-wizard.data_stewardship_wizard","master":"10|eosc________::fc6bad963e15e218efc62c7befd122af"} 
+{"datasource":"eosc________::egi-fed::egi-fed.check-in","master":"10|eosc________::baa3c497b9499b3d8c87ea8d2b37a44f"} +{"datasource":"eosc________::egi-fed::egi-fed.cloud_compute","master":"10|eosc________::b1179384a336d409fc909fe3711d3d1f"} +{"datasource":"eosc________::egi-fed::egi-fed.cloud_container_compute","master":"10|eosc________::a66bb1ac56a3bcf2c24b0ef85ed2bdfc"} +{"datasource":"eosc________::egi-fed::egi-fed.data_transfer","master":"10|eosc________::6c0bf38e885c42161b88093517f6cd3e"} +{"datasource":"eosc________::egi-fed::egi-fed.egi_datahub","master":"10|eosc________::5a260dae80795584ac08df133adb1fad"} +{"datasource":"eosc________::egi-fed::egi-fed.fitsm_training","master":"10|eosc________::927b4455c0a21692d2a9f634bccd8309"} +{"datasource":"eosc________::egi-fed::egi-fed.high-throughput_compute","master":"10|eosc________::e27ec11ac7b7d6ffbbce668b7d1f81d5"} +{"datasource":"eosc________::egi-fed::egi-fed.iso_27001_training","master":"10|eosc________::98a6655b6421166c5c29baa2f5815de3"} +{"datasource":"eosc________::egi-fed::egi-fed.notebook","master":"10|eosc________::1d37909a6a31147a09ee9f2e579a6706"} +{"datasource":"eosc________::egi-fed::egi-fed.online_storage","master":"10|eosc________::d8b94284582d3e2185a782ae2ba42186"} +{"datasource":"eosc________::egi-fed::egi-fed.training_infrastructure","master":"10|eosc________::38cdb8e44638f2e561c466f0dd26cf96"} +{"datasource":"eosc________::egi-fed::egi-fed.workload_manager","master":"10|eosc________::ff515071cd88afb40599edcb6637f47e"} +{"datasource":"eosc________::ehri::ehri.begrenzte_flucht","master":"10|eosc________::01d1445605fc1d25e6a7f21ba995d724"} +{"datasource":"eosc________::ehri::ehri.diplomatic_reports","master":"10|eosc________::11714353d2ed069ca30b177d4b4d9e0f"} +{"datasource":"eosc________::ehri::ehri.early_holocaust_testimony","master":"10|eosc________::0a4974b0bb295b98f88cb7c793f91c17"} 
+{"datasource":"eosc________::ehri::ehri.ehri_document_blog","master":"10|eosc________::fb9291f8dac099986eafe957b169ed97"} +{"datasource":"eosc________::ehri::ehri.international_research_portal_for_records_related_to_nazi-era_cultural_property","master":"10|eosc________::01c5b10e57f9cbb4f3125f427375487e"} +{"datasource":"eosc________::ehri::ehri.the_ehri_portal","master":"10|eosc________::6ad4d5352fd192b5fecd76bbd7a7e8b7"} +{"datasource":"eosc________::eiscat::eiscat.eiscat_data_access_portal","master":"10|eosc________::0f06a55c8333ae4d197c1d263b2be6ba"} +{"datasource":"eosc________::elixir-italy::elixir-italy.laniakea_recas","master":"10|eosc________::01e84abe377339ea57ed521ac39130e9"} +{"datasource":"eosc________::elixir-uk::elixir-uk.cyverse_uk","master":"10|eosc________::6a6a05847befec6587bef7673112f5e5"} +{"datasource":"eosc________::elixir-uk::elixir-uk.workflowhub","master":"10|fairsharing_::c8cd63e1bf13c5016881652983fb615a"} +{"datasource":"eosc________::elsevier::elsevier.digital_commons","master":"10|eosc________::67d38b6a1f43184676b113369554676b"} +{"datasource":"eosc________::embl-ebi::embl-ebi.embassy_cloud","master":"10|eosc________::7f8b24797312b851916ee1be0f836de6"} +{"datasource":"eosc________::embl-ebi::embl-ebi.identifiersorg","master":"10|eosc________::564e9f467aad251143e12e2e6ec19768"} +{"datasource":"eosc________::embl-ebi::embl-ebi.identifiersorg_central_registry","master":"10|eosc________::441caf7eaa4a6602aceae36b2697b924"} +{"datasource":"eosc________::embl-ebi::embl-ebi.identifiersorg_resolution_services","master":"10|eosc________::8df6273a1cb2289dbbe3a4b5fe05aa53"} +{"datasource":"eosc________::emso_eric::emso_eric.emso_eric_data_portal","master":"10|eosc________::94a41630bd9ddea4a88ec0bfba1b9d95"} +{"datasource":"eosc________::enermaps::enermaps.enermaps_data_management_tool","master":"10|eosc________::11496ee8a69b4b955200da7f2c12fe3b"} 
+{"datasource":"eosc________::enhancer::enhancer.openrdmeu","master":"10|eosc________::04820bece2545235144903dec056bcbd"} +{"datasource":"eosc________::enhancer::enhancer.swiss_escience_grid_certificates","master":"10|eosc________::4968516eb3b1ad6d883e74a84827e963"} +{"datasource":"eosc________::eodc::eodc.data_catalogue_service","master":"10|eosc________::21c44a2b6946e02300dbe36a8edec650"} +{"datasource":"eosc________::eodc::eodc.jupyterhub_for_global_copernicus_data","master":"10|eosc________::f99ccd68bf3de6a0a3b0db3441a41bbd"} +{"datasource":"eosc________::eosc-dih::eosc-dih.piloting_and_co-design_of_the_business_pilots","master":"10|eosc________::178f3e4832afe9e477d761d2f3d95f85"} +{"datasource":"eosc________::eox::eox.edc_eoxhub_workspace","master":"10|eosc________::d71468878e069cf484fc988d276c6d9a"} +{"datasource":"eosc________::esa-int::esa-int.geoss_web_portal","master":"10|eosc________::d7bac1ce234c20e3ab43a74eefa34782"} +{"datasource":"eosc________::esrf::esrf.the_european_synchrotron_radiation_facility_data_portal","master":"10|fairsharing_::2996962656838a97af4c5f926fe6f1b0"} +{"datasource":"eosc________::ess::ess.pan-learning-org","master":"10|eosc________::1298286d3a7cc48fa525b118218c7836"} +{"datasource":"eosc________::ess_eric::ess_eric.european_social_survey_ess_as_a_service","master":"10|eosc________::faa60b95b602690861be9305812a5c07"} +{"datasource":"eosc________::eudat::eudat.b2access","master":"10|eosc________::4dee0695b946b545dc8d52c56598fbbf"} +{"datasource":"eosc________::eudat::eudat.b2drop","master":"10|eosc________::4c6a514f1392ac1d159214e61785849a"} +{"datasource":"eosc________::eudat::eudat.b2find","master":"10|eosc________::6069f46dfcc89ccf8043581c9034558e"} +{"datasource":"eosc________::eudat::eudat.b2handle","master":"10|eosc________::a23be7f6265fd1ad957eed16b5c8bdc4"} +{"datasource":"eosc________::eudat::eudat.b2note","master":"10|eosc________::dfd1d6816b4182e25e84f6cf10d108ed"} 
+{"datasource":"eosc________::eudat::eudat.b2safe","master":"10|re3data_____::a632666349a0bb9a36096c9e152d34cc"} +{"datasource":"eosc________::eudat::eudat.b2share","master":"10|eosc________::f959324bdb00f052d547b95da205062f"} +{"datasource":"eosc________::eurac::eurac.edp-portal_-_metadata_catalogue_of_eurac_research","master":"10|eosc________::274d73061a925a29d8743b3e1022d0dc"} +{"datasource":"eosc________::europeana::europeana.europeana_apis","master":"10|eosc________::91de8c90ebde3dc1c8d41f339fe3fac7"} +{"datasource":"eosc________::exoscale::exoscale.european_cloud_hosting","master":"10|eosc________::12b7e6fef784084b817a42f2990fe3f2"} +{"datasource":"eosc________::expertai::expertai.document_enrichment_api","master":"10|eosc________::6812b902471f12506c8e6441195aff57"} +{"datasource":"eosc________::expertai::expertai.recommender_api","master":"10|eosc________::c40634543c1217686f0a8f5e8592d100"} +{"datasource":"eosc________::expertai::expertai.search_api","master":"10|eosc________::79440bc8082949f56cbabef796cec7f1"} +{"datasource":"eosc________::fairdi::fairdi.nomad_repository","master":"10|eosc________::b9000c95a6fde9930ae74f4071e14cb2"} +{"datasource":"eosc________::figshare::figshare.figshare","master":"10|eosc________::5e6bd062c6b85e2d176b2e61636b8971"} +{"datasource":"eosc________::forschungsdaten::forschungsdaten.forschungsdateninfo","master":"10|eosc________::c9185fdb68af7d515e56054da546bc94"} +{"datasource":"eosc________::forth::forth.openbioeu","master":"10|eosc________::2db71171816e994877fb960b9fcd89f2"} +{"datasource":"eosc________::fssda::fssda.data_service_portal_aila","master":"10|eosc________::ef1f75ea6d244563bc6cfb0c3d3affa4"} +{"datasource":"eosc________::fssda::fssda.kuha2_metadata_server","master":"10|eosc________::b6af28d7c292dbbe816cd0d6a9a66f16"} +{"datasource":"eosc________::gbif-es::gbif-es.collections_registry","master":"10|eosc________::ac6da0cfbd07f8605c57a799c41dc947"} 
+{"datasource":"eosc________::gbif-es::gbif-es.e-Learning_platform","master":"10|eosc________::9059ca88ca8292881ffba9ad8d943d04"} +{"datasource":"eosc________::gbif-es::gbif-es.images_portal","master":"10|eosc________::6991e5dd230956156129669934798cd8"} +{"datasource":"eosc________::gbif-es::gbif-es.occurrence_records","master":"10|eosc________::948a9a53e2a9c94d32f99785eccff662"} +{"datasource":"eosc________::gbif-es::gbif-es.regions_module","master":"10|eosc________::11189c308854c8d8113161edc7fbd3de"} +{"datasource":"eosc________::gbif-es::gbif-es.spatial_portal","master":"10|eosc________::665f73f5e4b6a3693fec9426a6ce6ae8"} +{"datasource":"eosc________::gbif-es::gbif-es.species_portal","master":"10|eosc________::9fe2f2ccb3d17452bd6e7424f60340ce"} +{"datasource":"eosc________::gbif::gbif.gbif_species_occurrence_data","master":"10|fairsharing_::6e5025ccc7d638ae4e724da8938450a6"} +{"datasource":"eosc________::gbif_portugal::gbif_portugal.gbif_portugal_occurrence_records","master":"10|eosc________::fcd4f4efdecb4e675fdee043043f69fc"} +{"datasource":"eosc________::gcc_umcg::gcc_umcg.molgenis","master":"10|eosc________::7f255ebbb3715f258e8d7c470209e675"} +{"datasource":"eosc________::geant::geant.clouds_service_infrastructure_as_a_service","master":"10|eosc________::7debc69506a8019515d350707e8c82d7"} +{"datasource":"eosc________::geant::geant.edugain","master":"10|eosc________::3ded12106e7e870242f7ec39345b3b97"} +{"datasource":"eosc________::geant::geant.edumeet_-_webbased_videoconferencing_platform","master":"10|eosc________::dcf8b262f7f61d44eedf409a29d30abc"} +{"datasource":"eosc________::geant::geant.eduroam","master":"10|eosc________::e7fd04aab1f224aaa2b5d3478694748b"} +{"datasource":"eosc________::geant::geant.eduteams","master":"10|eosc________::f3b04fa1e741f17a842fcbea35e04318"} 
+{"datasource":"eosc________::geant::geant.eduvpn_-_access_your_institutes_network_or_the_internet_using_an_encrypted_connection","master":"10|eosc________::aeb7c573f2742ec5ef8b7332b6b614cb"} +{"datasource":"eosc________::geant::geant.inacademia","master":"10|eosc________::26cb3be539a5bbb25533d3b1bdb9d6aa"} +{"datasource":"eosc________::geant::geant.ip","master":"10|eosc________::59cd8dbce2703f4eea69a54a959aae89"} +{"datasource":"eosc________::geant::geant.l3vpn","master":"10|eosc________::1e70cff61071ce42baffa6dafaf3165e"} +{"datasource":"eosc________::geant::geant.lambda","master":"10|eosc________::20a8114b376bf4c455c034b7b4513805"} +{"datasource":"eosc________::geant::geant.mdvpn","master":"10|eosc________::54fbf0ac4e42a2ce51e400d9783b51ba"} +{"datasource":"eosc________::geant::geant.open","master":"10|eosc________::9ae24d8c63e9ff986fbd20705b334919"} +{"datasource":"eosc________::geant::geant.perfsonar","master":"10|eosc________::1bdda4f743377914fabd0f365a8b6ee2"} +{"datasource":"eosc________::geant::geant.plus","master":"10|eosc________::eef45e860d52aff4932f254599d5b713"} +{"datasource":"eosc________::geant::geant.transits_training","master":"10|eosc________::831e2b596060c60d7d4bc79c200a2254"} +{"datasource":"eosc________::geant::geant.trusted_certificate_service","master":"10|eosc________::30817adfb6c625d7fd36b657e2fabc74"} +{"datasource":"eosc________::geant::geant.wifimon","master":"10|eosc________::6116f3b14f34658593529f6810068c4e"} +{"datasource":"eosc________::genias::genias.e-irg_knowledge_base","master":"10|eosc________::ddc5ab67fed353917716eb2d5c86ce68"} +{"datasource":"eosc________::gesis::gesis.doi_registration_service","master":"10|eosc________::71f37a7ebd8495a59c46e637ee5463da"} +{"datasource":"eosc________::grnet::grnet.agora_resource_portfolio_management_tool","master":"10|eosc________::461aa754c52b7eed605f9e0955470de5"} 
+{"datasource":"eosc________::grnet::grnet.argo_monitoring_engine","master":"10|eosc________::e91a3b4dfb62113b9b67b0ac97e566b4"} +{"datasource":"eosc________::grnet::grnet.aris","master":"10|eosc________::6b381464ec768e3cf55ccacdb00b5988"} +{"datasource":"eosc________::grnet::grnet.aris_-_archival_service","master":"10|eosc________::32158f91e33cf6fb6c63561cbc7ffd24"} +{"datasource":"eosc________::grnet::grnet.ni4os-europe_login","master":"10|eosc________::aeaa8f7fc2948930bfa4f970cd96837e"} +{"datasource":"eosc________::grnet::grnet.ni4os-europe_repository_service","master":"10|eosc________::d6933cb7acd6fa7a2f7a42562c432fb5"} +{"datasource":"eosc________::grycap::grycap.elastic_cloud_compute_cluster","master":"10|eosc________::c6d3c380ce5499d8d20cc9bbeb3b43ff"} +{"datasource":"eosc________::grycap::grycap.infrastructure_manager","master":"10|eosc________::e8a2eeb06a205c3299af49f5c233ce16"} +{"datasource":"eosc________::grycap::grycap.saps_surface_energy_balance_automated_processing_service","master":"10|eosc________::a7ae875b2487576c35f1bc8e1c857c14"} +{"datasource":"eosc________::hn::hn.isidore","master":"10|re3data_____::fabe5c1aaa2e2d4c847e01647b87bf60"} +{"datasource":"eosc________::hostkey::hostkey.gpu_servers_grant_program","master":"10|eosc________::d45f87107eb536b4be97e112fac15787"} +{"datasource":"eosc________::icos_eric::icos_eric.data_discovery_and_access_portal","master":"10|eosc________::84ada2e91828ce72fa6d02736cdd90f1"} +{"datasource":"eosc________::ifca-csic::ifca-csic.deepaas_training_facility","master":"10|eosc________::5414e2342e67d64b11b835e7fd58869d"} +{"datasource":"eosc________::ifca-csic::ifca-csic.ifca-csic_cloud_infrastructure","master":"10|eosc________::838e5c334e8115e4831d5f21435aa19b"} +{"datasource":"eosc________::ifca-csic::ifca-csic.plant_classification","master":"10|eosc________::32c26f83acaef8d89cc6c7a2f8abd198"} 
+{"datasource":"eosc________::ifca-csic::ifca-csic.remote_monitoring_and_smart_sensing","master":"10|eosc________::0335d29ec68ef9ebad8326cba79455f2"} +{"datasource":"eosc________::ifin-hh::ifin-hh.cloudifin","master":"10|eosc________::04d791df0b61b0f5060f241c70924991"} +{"datasource":"eosc________::iisas::iisas.dynamic_dns_service","master":"10|eosc________::2381e3b55d048130f2dffd437123d501"} +{"datasource":"eosc________::iisas::iisas.fedcloudclient_egi_fedcloud_client","master":"10|eosc________::3668885b6512a039673b9f4638c88600"} +{"datasource":"eosc________::iisas::iisas.modelling_service_for_water_supply_systems","master":"10|eosc________::b1d6d2cebddf52f6647102a30690fba9"} +{"datasource":"eosc________::ill::ill.ill_data_portal","master":"10|eosc________::714498cf1efec13c2206db4b1e4f1c30"} +{"datasource":"eosc________::ill::ill.panosc_software_catalogue","master":"10|eosc________::bc63c5a78abd38a7d9df043e0853a9ce"} +{"datasource":"eosc________::inaf::inaf.space-ml_caesar_service","master":"10|eosc________::ba42c5e4332ff16c6cd28573012bc2f9"} +{"datasource":"eosc________::inaf::inaf.space-vis_vialactea_service","master":"10|eosc________::ce2ca563bceae686b763326ed53e7b54"} +{"datasource":"eosc________::infn::infn.dynamic_on_demand_analysis_service","master":"10|eosc________::f884894e05c5a54646f0b5715e5495d6"} +{"datasource":"eosc________::infn::infn.fgsg_science_software_on_demand","master":"10|eosc________::452af4e76a64b6ee7e4bdc86527687f7"} +{"datasource":"eosc________::infn::infn.indigo_identity_and_access_management","master":"10|eosc________::d23115c40a4e256725f140330d001861"} +{"datasource":"eosc________::infn::infn.infn-cloud_object_storage_dice","master":"10|eosc________::fe0c28e8657cb84e3b775156106c03d1"} +{"datasource":"eosc________::infn::infn.paas_orchestrator","master":"10|eosc________::146240bb16057a93e11631edee570f76"} 
+{"datasource":"eosc________::infrafrontier::infrafrontier.training_in_mouse_functional_genomics","master":"10|eosc________::64d6597d10f4e617152f4a612a87eaba"} +{"datasource":"eosc________::inria::inria.software_heritage_archive","master":"10|fairsharing_::2c758933af02c0b301906f2819ae1268"} +{"datasource":"eosc________::jelastic::jelastic.platform-as-a-service","master":"10|eosc________::bfcae4ab00df41a3c43efbb879586e8f"} +{"datasource":"eosc________::kit::kit.eosc-performance","master":"10|eosc________::e52ab75587c1dd98db80568197f04586"} +{"datasource":"eosc________::kit::kit.o3as_ozone_assessment","master":"10|eosc________::aaf27a5f35a790617247abecd84b100f"} +{"datasource":"eosc________::komanord::komanord.guardomic","master":"10|eosc________::b1e06c9d2c472e9441ee72e83a934d40"} +{"datasource":"eosc________::lago::lago.onedatasim","master":"10|eosc________::2b2163e8b82320fed69a017a3e5fb657"} +{"datasource":"eosc________::lifewatch-eric::lifewatch-eric.plants_identification_app","master":"10|eosc________::6fc6ed0894391496d3c4967d45933d1a"} +{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.elixirfm","master":"10|eosc________::6dd7c323776a028cef0619cb34bdf48c"} +{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.ker_-_keyword_extractor","master":"10|eosc________::09915f038900aa43cb0c76aa89f10cda"} +{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.lindatclariah-cz_repository","master":"10|eosc________::3daee6a29fb1d9a0f624cdd5973c33ea"} +{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.machine_translation","master":"10|eosc________::3ae4551729381cfd03c433fb0de0c971"} +{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.morphodita","master":"10|eosc________::f2ceebdc1a41d65504ff27f7297c833b"} +{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.nametag","master":"10|eosc________::71e3226e7a868e2215335ffb29073285"} 
+{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.udpipe_tool_for_lemmatization_morphological_analysis_pos_tagging_and_dependency_parsing_in_multiple_languages","master":"10|eosc________::2dfc64c2951d9be3f1e2b576633ea425"} +{"datasource":"eosc________::lnec::lnec-pt.opencoasts_portal","master":"10|eosc________::7e99655aeda0b5f06efb3eea424dff54"} +{"datasource":"eosc________::lnec::lnec.worsica_-_water_monitoring_sentinel_cloud_platform","master":"10|eosc________::c2f55ab774c3cbbd9a330eebaa74dc36"} +{"datasource":"eosc________::materialscloud::materialscloud.aiiDA_lab","master":"10|eosc________::dfd970a812cf2e0298eb28c681bc109f"} +{"datasource":"eosc________::materialscloud::materialscloud.materials_cloud_archive","master":"10|fairsharing_::a431d70133ef6cf688bc4f6093922b48"} +{"datasource":"eosc________::meeo::meeo.adam_platform","master":"10|eosc________::b17fedb87dd9985b6a5e51db593446d6"} +{"datasource":"eosc________::meeo::meeo.adam_space","master":"10|eosc________::24bfbca4cf4fedc5a4a662fe67a30d7e"} +{"datasource":"eosc________::mobile_observation_integration_service::mobile_observation_integration_service.dark_sky_meter_datasource","master":"10|eosc________::160638e73224aeb7e4f98fd237672919"} +{"datasource":"eosc________::msw::msw.polaris_os","master":"10|eosc________::12348ba5b2c5902fd400cb3f1ab773ee"} +{"datasource":"eosc________::obp::obp.thoth","master":"10|eosc________::680198ec3f51a744de8a7603d542a0e1"} +{"datasource":"eosc________::openaire::openaire.amnesia","master":"10|eosc________::ac57e2dd5b3ee01909d7a592523bb96f"} +{"datasource":"eosc________::openaire::openaire.argos","master":"10|eosc________::92145beb3257af0510ee61ef10d16870"} +{"datasource":"eosc________::openaire::openaire.broker","master":"10|eosc________::c8c6e8d211d6df4ee8a187fa1134bd92"} +{"datasource":"eosc________::openaire::openaire.data_provider_dashboard","master":"10|eosc________::809d4c77a7acf9ac0cc2990d4264ae51"} 
+{"datasource":"eosc________::openaire::openaire.digital_humanities_and_cultural_heritage_openaire_community_gateway","master":"10|eosc________::b9110e9735dd467abc969fe8e2f1efa3"} +{"datasource":"eosc________::openaire::openaire.discovery_portal","master":"10|eosc________::992052173b689c8cea94e8e8d99f0238"} +{"datasource":"eosc________::openaire::openaire.european_marine_science_openaire_dashboard","master":"10|eosc________::950a99851df85c90ec2e933e1d55e164"} +{"datasource":"eosc________::openaire::openaire.funder_dashboard","master":"10|eosc________::196eea80ab9d73766cd2e8b6ab85872f"} +{"datasource":"eosc________::openaire::openaire.graph","master":"10|eosc________::c122caed52a88b57732b814a74141000"} +{"datasource":"eosc________::openaire::openaire.greek_sustainable_development_solutions_network_sdsn_openaire_dashboard","master":"10|eosc________::8100e41e3a5b18170bc5ede2cc393331"} +{"datasource":"eosc________::openaire::openaire.inference","master":"10|eosc________::c491811e9a6afa69cdcab0f92fca6f7b"} +{"datasource":"eosc________::openaire::openaire.neuroinformatics_openaire_dashboard","master":"10|eosc________::6e3adcce4d0d4229a9749584dfd5e7a8"} +{"datasource":"eosc________::openaire::openaire.open_science_helpdesk","master":"10|eosc________::d66db88d4c6c354fe7ebcd4c3dce334e"} +{"datasource":"eosc________::openaire::openaire.open_science_observatory","master":"10|eosc________::441ee64860eb79808b7cf0bb08262be6"} +{"datasource":"eosc________::openaire::openaire.open_science_training","master":"10|eosc________::99847506cdff50afa4945d60a9661ea3"} +{"datasource":"eosc________::openaire::openaire.openaire_login","master":"10|eosc________::818973a9375c0fa545499e1bb9ad0ab2"} +{"datasource":"eosc________::openaire::openaire.openapc","master":"10|eosc________::a28cc193bc938573e892b8aad0017702"} +{"datasource":"eosc________::openaire::openaire.research_community_dashboard","master":"10|eosc________::e1a866322f76407fb161a253dc5b539c"} 
+{"datasource":"eosc________::openaire::openaire.scholexplorer","master":"10|eosc________::6b34adede04121175566ef8c70f1e520"} +{"datasource":"eosc________::openaire::openaire.technical_support_towards_openaire_compliance","master":"10|eosc________::cdb8e94b386f9b6780a47194bd1bc7f7"} +{"datasource":"eosc________::openaire::openaire.topos_observatory_for_organisations","master":"10|eosc________::a7d2b95257273b5ea3f3a23fd8a60d48"} +{"datasource":"eosc________::openaire::openaire.usage_statistics","master":"10|eosc________::8aa345dc7321fc97906bf4c193a05a8f"} +{"datasource":"eosc________::openaire::openaire.validator","master":"10|eosc________::f2c13efbaa2a33af3e4e6a54805ac379"} +{"datasource":"eosc________::openaire::openaire.zenodo","master":"10|opendoar____::358aee4cc897452c00244351e4d91f69"} +{"datasource":"eosc________::openbiomaps::openbiomaps.openbiomaps","master":"10|eosc________::32edf5a4edbdea0899d6ba588d083efd"} +{"datasource":"eosc________::openedition::openedition.operas_research_for_society","master":"10|eosc________::2cdf4f57007b990b7ad7a884796f9b15"} +{"datasource":"eosc________::openknowledgemaps::openknowledgemaps.open_knowledge_maps","master":"10|eosc________::f3819d0f8e8bf57d383b23d31a3c0099"} +{"datasource":"eosc________::openminted::openminted.builder_of_tdm_applications","master":"10|eosc________::fdd26c19dd490260bc6c48b5813f4ac3"} +{"datasource":"eosc________::openminted::openminted.catalogue_of_ancillary_resources","master":"10|eosc________::ab4e37e85a1975b204b66683ed3888a8"} +{"datasource":"eosc________::openminted::openminted.catalogue_of_corpora","master":"10|eosc________::2cf744a594ea30fd31e976bffa8f2b71"} +{"datasource":"eosc________::openminted::openminted.catalogue_of_tdm_applications","master":"10|eosc________::ef5f343c5cf11fa2d40407ec308bb34a"} +{"datasource":"eosc________::openminted::openminted.catalogue_of_tdm_components","master":"10|eosc________::4275243a94677f19a5b74e5afb1f94cf"} 
+{"datasource":"eosc________::openminted::openminted.consulting_on_licences_for_tdm","master":"10|eosc________::522000b4c90b209aa7be961449ca910f"} +{"datasource":"eosc________::openminted::openminted.corpus_builder_for_scholarly_works","master":"10|eosc________::c64725d47af63bc2114b4214b684a392"} +{"datasource":"eosc________::openminted::openminted.support_and_training","master":"10|eosc________::84501ff99e5e429f5f083ab8ca0be7e4"} +{"datasource":"eosc________::openminted::openminted.tdm_applications_executor","master":"10|eosc________::e9ae655ce2ff1eaa19d0b3475ce5e660"} +{"datasource":"eosc________::operas::operas.gotriple_discovery_platform","master":"10|eosc________::f687e24dc56aaeeb561c95865a5071cc"} +{"datasource":"eosc________::operas::operas.operas_metrics_service","master":"10|eosc________::5960e1289f623625210f720c6173592d"} +{"datasource":"eosc________::oslo_university::oslo_university.services_for_sensitive_data_tsd","master":"10|eosc________::743b01351510f88e24be1c700c581f68"} +{"datasource":"eosc________::osmooc::osmooc.open_science_mooc","master":"10|eosc________::e101101e8653b6607a3ad9fea3b7d1fe"} +{"datasource":"eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing","master":"10|openaire____::bf5a61cc330e21ffa90eed3eb1533466"} +{"datasource":"eosc________::phenomenal::phenomenal.phenomenal","master":"10|eosc________::79e19b14aeee0d94e9a79110a6e6ad32"} +{"datasource":"eosc________::plantnet::plantnet.plntnet_identification_service","master":"10|eosc________::5ce89743eafdd8578591d84150f547e4"} +{"datasource":"eosc________::prace::prace.application_enabling_support","master":"10|eosc________::c87fd74ed685337fdbcff504373fc513"} +{"datasource":"eosc________::prace::prace.code_vault","master":"10|eosc________::dbab7889c81b59ec753040a762f6569a"} +{"datasource":"eosc________::prace::prace.deci_access","master":"10|eosc________::c7cedb82b1beea5382601d48807212aa"} 
+{"datasource":"eosc________::prace::prace.mooc","master":"10|eosc________::d6ff8167d31dccebe33a272513422b53"} +{"datasource":"eosc________::prace::prace.patc","master":"10|eosc________::1ab1b123bd559ee7f7c7ec2ee353f0c0"} +{"datasource":"eosc________::prace::prace.preparatory_access","master":"10|eosc________::39430adf529f1ab9e33da444b3708fcf"} +{"datasource":"eosc________::prace::prace.project_access","master":"10|eosc________::b58e957946983b686c76ee19dfab8d70"} +{"datasource":"eosc________::prace::prace.ptc","master":"10|eosc________::b3ca18e8884bfe2422d3723313fef79c"} +{"datasource":"eosc________::prace::prace.seasonal_schools_and_international_summer_school","master":"10|eosc________::590c71318d9d94c32981e3195567d546"} +{"datasource":"eosc________::prace::prace.shape","master":"10|eosc________::38b5a26f74e4808270a2d4f305d2f3a5"} +{"datasource":"eosc________::prace::prace.training_portal","master":"10|eosc________::25966a269ab2343ac9c4d982c341d87f"} +{"datasource":"eosc________::predictia::predictia.climadjust","master":"10|eosc________::14743eb22da3524893784faf409aac70"} +{"datasource":"eosc________::psi::psi.psi_public_data_repository","master":"10|re3data_____::1e55174ff77ed2d804871281201dbb50"} +{"datasource":"eosc________::psi::psi.remote_desktop_service","master":"10|eosc________::c82e26eb6e65d008de03b349dffc11fc"} +{"datasource":"eosc________::psnc::psnc.rohub","master":"10|eosc________::c87f08707b5235172e85b374e39a82dc"} +{"datasource":"eosc________::psnc::psnc.symbiote","master":"10|eosc________::ef0cd965a0d0a3df80ecfae4b3b08aad"} +{"datasource":"eosc________::rasdaman::rasdaman.datacube","master":"10|eosc________::bb1678f7b15d8c15fde6e240a4f95f93"} +{"datasource":"eosc________::rbi::rbi.dariah_science_gateway","master":"10|eosc________::b51b448421d926293b3781f4ac90f4f4"} +{"datasource":"eosc________::readcoop::readcoop.transkribus","master":"10|eosc________::a80411026809e6eaa896439e1b9764f4"} 
+{"datasource":"eosc________::rli::rli.open_energy_platform","master":"10|fairsharing_::0cbed40c0d920b94126eaf5e707be1f5"} +{"datasource":"eosc________::ror-org::ror-org.identifier","master":"10|eosc________::6fe92c2346db22322ddf6b677d449b0e"} +{"datasource":"eosc________::sciences_po::sciences_po.ethnic_and_migrant_minority_survey_registry","master":"10|eosc________::0cde986dc2bf015912e407f0f83ee402"} +{"datasource":"eosc________::sciences_po::sciences_po.wpss_for_ess","master":"10|eosc________::9a5bb11c495443aad944b04f5fcb5c07"} +{"datasource":"eosc________::scigne::scigne.cloud_compute","master":"10|eosc________::7c63e3284c36b5977c553192dce506b3"} +{"datasource":"eosc________::scipedia::scipedia.scipedia","master":"10|eosc________::850abcddc76069f2c3c1cf77ad4beec9"} +{"datasource":"eosc________::scipedia::scipedia.topos_for_individuals","master":"10|eosc________::e6214b58f39a25b53eecda340f95ee7b"} +{"datasource":"eosc________::seadatanet::seadatanet.doi_minting_service","master":"10|eosc________::f87f72147a3c82c4f77684e40101e90e"} +{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_marine_environmental_data_edmed","master":"10|eosc________::d79706389f0b864306feb47aac1f5766"} +{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_marine_environmental_research_projects","master":"10|eosc________::baa9d2d6cdd8507fcbf76242e4c25d76"} +{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_marine_organisations_edmo","master":"10|eosc________::5d23c66c26e0df209fc415c1e9ad0316"} +{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_the_cruise_summary_reports_csr","master":"10|eosc________::fd70912c66037dc11f710587e281eeaf"} +{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_the_initial_ocean-observing_systems_edios","master":"10|eosc________::846016e987d1feaf2a36083f88dba1f2"} 
+{"datasource":"eosc________::seadatanet::seadatanet.seadatanet_cdi","master":"10|eosc________::36cd158d6b1bbdbfb443c68b8da00335"} +{"datasource":"eosc________::seadatanet::seadatanet.vocabulary_services_-_underpinned_by_the_nerc_vocabulary_server_nvs","master":"10|eosc________::4416d18ec7a57e553979fbfa4d862483"} +{"datasource":"eosc________::sinergise::sinergise.sentinel_hub","master":"10|eosc________::d36ae944fa207461bcb7b2b3a6c94de8"} +{"datasource":"eosc________::sixsq::sixsq.nuvla_multi-cloud_application_management_platform","master":"10|eosc________::38438cc3190a3815359efb53b9dd98eb"} +{"datasource":"eosc________::sks::sks.digital_production_for_conferences_workshops_roundtables_and_other_academic_and_professional_events","master":"10|eosc________::f6b51bef4a5f1478e980673339f2b2f3"} +{"datasource":"eosc________::smartsmear::smartsmear.smartsmear","master":"10|eosc________::d17a9325ca64ffad59e04659ed5404f7"} +{"datasource":"eosc________::sobigdata::sobigdata.tagme","master":"10|eosc________::0c3b8b80d9d6d38effd28bfa6a140a12"} +{"datasource":"eosc________::suite5::suite5.furniture_enterprise_analytics","master":"10|eosc________::29ed60070bd91bdc19c9f278b104465c"} +{"datasource":"eosc________::switch::switch.switchengines","master":"10|eosc________::d4143918a810115206640cfeb11e0ba6"} +{"datasource":"eosc________::t-systems::t-systems.open_telekom_cloud","master":"10|eosc________::c489ef6564a47922359f7b833919d642"} +{"datasource":"eosc________::terradue::terradue.eo_services_for_earthquake_response_and_landslides_analysis","master":"10|eosc________::ab3140d145deb5fdb02eeefbc5ebc471"} +{"datasource":"eosc________::tib::tib.open_research_knowledge_graph_orkg","master":"10|eosc________::ed6bd695c7a99297f360bc2fc915be90"} +{"datasource":"eosc________::ubora::ubora.ubora","master":"10|eosc________::bacf05aff1c6dcf3133a0352d5eb14c4"} +{"datasource":"eosc________::ubora::ubora.ubora_e-platform","master":"10|eosc________::947fde33605ba61216a07135ee1551f2"} 
+{"datasource":"eosc________::ugr-es::ugr-es.glacier_lagoons_of_sierra_nevada","master":"10|eosc________::8a966c0efca298ad5ec130d323c29935"} +{"datasource":"eosc________::uit::uit.dataverseno","master":"10|eosc________::92b76aa81a5b8443fcf17d3ae3c34211"} +{"datasource":"eosc________::uit::uit.the_troms_repository_of_language_and_linguistics_trolling","master":"10|fairsharing_::a36b0dcd1e6384abc0e1867860ad3ee3"} +{"datasource":"eosc________::ukaea::ukaea.prominence","master":"10|eosc________::06ce999c7cf77ea5a65f87bb563cd625"} +{"datasource":"eosc________::ukri_-_stfc::ukri_-_stfc.cvmfs_test","master":"10|eosc________::53aaa0a24d0edc47c23e722135c29dde"} +{"datasource":"eosc________::ukri_-_stfc::ukri_-_stfc.rucio","master":"10|eosc________::c19a8251c6bf563365c555572ace903e"} +{"datasource":"eosc________::uni-freiburg::uni-freiburg.european_galaxy_server","master":"10|eosc________::cc00fc2385475b80accec001dfb85efb"} +{"datasource":"eosc________::unibo::unibo.opencitations","master":"10|eosc________::573c29ecaf76ab961743bfc8a7d911ec"} +{"datasource":"eosc________::unifl::unifl.snap4city","master":"10|eosc________::9a55c40c3c082b7a8352ecbc56a87996"} +{"datasource":"eosc________::unige::unige.astronomical_online_data_analysis_astrooda","master":"10|eosc________::63f6119d3170cccf979daada3c5b524e"} +{"datasource":"eosc________::unitartu::unitartu.ut.rocket","master":"10|eosc________::da3450589a9d56212963b20cf729974c"} +{"datasource":"eosc________::upv-es::upv-es.lemonade","master":"10|eosc________::afdd227beada491f77d7944d7a0eafc9"} +{"datasource":"eosc________::vamdc::vamdc.portal","master":"10|eosc________::4dab2bb6e9a9ad223cd63c62c2ea804e"} +{"datasource":"eosc________::vamdc::vamdc.query_store","master":"10|eosc________::33f18bfe544c3c84ac28be6a3292d166"} +{"datasource":"eosc________::vamdc::vamdc.species_database","master":"10|eosc________::ae3587682dec5663a1b3b625036d15d0"} 
+{"datasource":"eosc________::vilnius-university::vilnius-university.the_national_open_access_research_data_archive_midas","master":"10|eosc________::4987ee0d071f68cf88f6b1a834b6733f"} +{"datasource":"eosc________::wenmr::wenmr.amber-based_portal_server_for_nmr_structures_amps-nmr","master":"10|eosc________::c6cca9747ef3ce296bd626bcbc4e480a"} +{"datasource":"eosc________::wenmr::wenmr.disvis_web_portal","master":"10|eosc________::2539ec693b683284c4e243b969ae3fc0"} +{"datasource":"eosc________::wenmr::wenmr.fanten_finding_anisotropy_tensor","master":"10|eosc________::99c793e3f3b856c48eaaa36682038b28"} +{"datasource":"eosc________::wenmr::wenmr.haddock24_web_portal","master":"10|eosc________::0f198f6a0885105809f420be23614be3"} +{"datasource":"eosc________::wenmr::wenmr.metalpdb","master":"10|eosc________::84676bc3d2ce17de70309dc58f428296"} +{"datasource":"eosc________::wenmr::wenmr.pdb-tools_web","master":"10|eosc________::b37eed45624ac30f3476f71640e59a61"} +{"datasource":"eosc________::wenmr::wenmr.powerfit_web_portal","master":"10|eosc________::93d4d621ed1da378c0e7dc891cefc007"} +{"datasource":"eosc________::wenmr::wenmr.spoton","master":"10|eosc________::76e7e0552f9c6b89db94b31ddc366b9f"} \ No newline at end of file From 627332526b4a82b09190d3f3aefd106e274e38dd Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 22 Jul 2022 14:55:11 +0200 Subject: [PATCH 13/32] [EOSC context TAG] workflow start from reset_outputpath action --- .../resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml index 6016e0179e..17fc76aad4 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -44,7 +44,7 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -254,7 +254,7 @@ --postgresUser${postgresUser} --postgresPassword${postgresPassword} - + From 06a95daf60c49d9b0781d0e1f37965fc804a6491 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 22 Jul 2022 14:57:06 +0200 Subject: [PATCH 14/32] [EOSC context TAG] refactoring after compilation --- .../dhp/bulktag/eosc/SparkEoscBulkTag.java | 6 +- .../dhp/bulktag/EOSCContextTaggingTest.java | 224 +++++++++--------- 2 files changed, 118 insertions(+), 112 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java index 600a5cec8e..3ed910184c 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java @@ -93,13 +93,9 @@ public class SparkEoscBulkTag implements Serializable { String datasourceMapPath, Class resultClazz) { - List hostedByList = readPath(spark, datasourceMapPath, DatasourceMaster.class) .map((MapFunction) dm -> dm.getMaster(), Encoders.STRING()) - .collectAsList(); - - - + .collectAsList(); readPath(spark, inputPath, resultClazz) .map(patchResult(), Encoders.bean(resultClazz)) diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCContextTaggingTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCContextTaggingTest.java index d6785acc7d..cbdab7628a 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCContextTaggingTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCContextTaggingTest.java @@ -1,16 +1,11 @@ package eu.dnetlib.dhp.bulktag; -/** - * @author 
miriam.baglioni - * @Date 22/07/22 - */ -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; -import eu.dnetlib.dhp.schema.oaf.Software; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -26,10 +21,17 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; +/** + * @author miriam.baglioni + * @Date 22/07/22 + */ +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; //"50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea" has instance hostedby eosc //"50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1" has instance hostedby eosc @@ -37,116 +39,124 @@ import java.util.List; //"50|475c1990cbb2::3894c94123e96df8a21249957cf160cb" has EoscTag public class EOSCContextTaggingTest { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - private static final Logger log = LoggerFactory.getLogger(EOSCContextTaggingTest.class); + private static final Logger log = LoggerFactory.getLogger(EOSCContextTaggingTest.class); - @BeforeAll - public static void beforeAll() throws 
IOException { - workingDir = Files.createTempDirectory(EOSCContextTaggingTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(EOSCContextTaggingTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(EOSCContextTaggingTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(EOSCContextTaggingTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = SparkSession - .builder() - .appName(EOSCTagJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(EOSCTagJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - void EoscContextTagTest() throws Exception { + @Test + void EoscContextTagTest() throws Exception { - spark - .read() - .textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/dataset/dataset_10.json").getPath()) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, Dataset.class), - 
Encoders.bean(Dataset.class)) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(workingDir.toString() + "/input/dataset"); + spark + .read() + .textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/dataset/dataset_10.json").getPath()) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, Dataset.class), + Encoders.bean(Dataset.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(workingDir.toString() + "/input/dataset"); + SparkEoscBulkTag + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", + workingDir.toString() + "/input/dataset", + "-workingPath", workingDir.toString() + "/working/dataset", + "-datasourceMapPath", + getClass() + .getResource("/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster") + .getPath(), + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset" + }); - SparkEoscBulkTag - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", - workingDir.toString() + "/input/dataset", - "-workingPath", workingDir.toString() + "/working/dataset", - "-datasourceMapPath", getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster").getPath(), - "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset" - }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/input/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - JavaRDD tmp = sc - .textFile(workingDir.toString() + "/input/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + Assertions.assertEquals(10, tmp.count()); - Assertions.assertEquals(10, tmp.count()); + Assertions + .assertEquals( + 4, + tmp + .filter( + s -> s.getContext().stream().anyMatch(c -> 
c.getId().equals("eosc"))) + .count()); - Assertions - .assertEquals( - 4, - tmp - .filter( - s -> s.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) - .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + d -> d.getId().equals("50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea") + && + d.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) + .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + d -> d.getId().equals("50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1") + && + d.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) + .count()); - Assertions - .assertEquals(1, - tmp - .filter(d -> d.getId().equals("50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea") - && - d.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))).count() - ); - Assertions - .assertEquals(1, - tmp - .filter(d -> d.getId().equals("50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1") - && - d.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))).count() - ); - - - Assertions - .assertEquals(1, - tmp - .filter(d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb") - && - d.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))).count() - ); - - Assertions - .assertEquals(1, - tmp - .filter(d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb") - && - d.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))).count() - ); - } + Assertions + .assertEquals( + 1, + tmp + .filter( + d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb") + && + d.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) + .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb") + && + d.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) + .count()); + } } From d091866e48ce0f814256c2f40ee5eddeb60511a3 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 25 Jul 2022 
11:12:22 +0200 Subject: [PATCH 15/32] [EOSC Context Tagging] refactoring --- .../dhp/bulktag/eosc/SparkEoscBulkTag.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java index 3ed910184c..99ff922232 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java @@ -121,14 +121,14 @@ public class SparkEoscBulkTag implements Serializable { (value.getEoscifguidelines() != null && value.getEoscifguidelines().size() > 0)) { Context context = new Context(); context.setId("eosc"); - OafMapperUtils - .dataInfo( - false, BULKTAG_DATA_INFO_TYPE, true, false, - OafMapperUtils - .qualifier( - CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, - DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), - TAGGING_TRUST); + context.setDataInfo(Arrays.asList(OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, + DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST))); value.getContext().add(context); } From 144c103b672c3165f8af4bc27f5515f620b3c7d2 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 25 Jul 2022 13:52:45 +0200 Subject: [PATCH 16/32] [EOSC Context Tagging] add check to avoid the insertion of the context if already present --- .../java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java index 99ff922232..24520ff26d 100644 --- 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java @@ -117,8 +117,9 @@ public class SparkEoscBulkTag implements Serializable { } private static R enrich(R value, List hostedByList) { - if (value.getInstance().stream().anyMatch(i -> hostedByList.contains(i.getHostedby().getKey())) || - (value.getEoscifguidelines() != null && value.getEoscifguidelines().size() > 0)) { + if (value.getInstance().stream().anyMatch(i -> (hostedByList.contains(i.getHostedby().getKey())) || + (value.getEoscifguidelines() != null && value.getEoscifguidelines().size() > 0)) && + !value.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) { Context context = new Context(); context.setId("eosc"); context.setDataInfo(Arrays.asList(OafMapperUtils From 0172bab25136b1a3f974c2d66ddee434a712e26f Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 25 Jul 2022 14:16:45 +0200 Subject: [PATCH 17/32] [EOSC Context Tagging] refactoring --- .../dhp/bulktag/eosc/SparkEoscBulkTag.java | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java index 24520ff26d..66e0c8af4f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscBulkTag.java @@ -117,19 +117,28 @@ public class SparkEoscBulkTag implements Serializable { } private static R enrich(R value, List hostedByList) { - if (value.getInstance().stream().anyMatch(i -> (hostedByList.contains(i.getHostedby().getKey())) || - (value.getEoscifguidelines() != null && value.getEoscifguidelines().size() > 0)) && - !value.getContext().stream().anyMatch(c -> 
c.getId().equals("eosc"))) { + if (value + .getInstance() + .stream() + .anyMatch( + i -> (hostedByList.contains(i.getHostedby().getKey())) || + (value.getEoscifguidelines() != null && value.getEoscifguidelines().size() > 0)) + && + !value.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) { Context context = new Context(); context.setId("eosc"); - context.setDataInfo(Arrays.asList(OafMapperUtils - .dataInfo( - false, BULKTAG_DATA_INFO_TYPE, true, false, + context + .setDataInfo( + Arrays + .asList( OafMapperUtils - .qualifier( + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), - TAGGING_TRUST))); + TAGGING_TRUST))); value.getContext().add(context); } From 1c82acb16835ea7ce3f70f8c847204f0ac487708 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 25 Jul 2022 14:26:39 +0200 Subject: [PATCH 18/32] [EOSC Context Tagging] refactoring: moved EOSC IF tagging in package eosc under bulkTag --- .../eu/dnetlib/dhp/bulktag/{ => eosc}/SparkEoscTag.java | 7 +------ .../eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml | 2 +- .../test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java | 5 +---- 3 files changed, 3 insertions(+), 11 deletions(-) rename dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/{ => eosc}/SparkEoscTag.java (96%) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java similarity index 96% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java index 730e8a3fe7..7364966415 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java @@ -1,13 +1,10 @@ -package eu.dnetlib.dhp.bulktag; +package eu.dnetlib.dhp.bulktag.eosc; import static eu.dnetlib.dhp.PropagationConstant.readPath; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.*; -import java.util.function.Function; -import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -21,9 +18,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml index 17fc76aad4..9c1bbdf721 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -226,7 +226,7 @@ yarn-cluster cluster EOSC_tagging - eu.dnetlib.dhp.bulktag.SparkEoscTag + eu.dnetlib.dhp.bulktag.eosc.SparkEoscTag dhp-enrichment-${projectVersion}.jar --num-executors=${sparkExecutorNumber} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index 5f47da10e5..17e570fdf4 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -1,21 
+1,18 @@ package eu.dnetlib.dhp.bulktag; -import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR; - import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; +import eu.dnetlib.dhp.bulktag.eosc.SparkEoscTag; import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterAll; From 35bcd9422d60be622a0dd6f71a4a5be03513e07b Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 25 Jul 2022 15:45:22 +0200 Subject: [PATCH 19/32] [EOSC Context Tagging] removed not needed specification in path --- .../eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java index 9ad108749f..e9b1d3cfd8 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/ReadMasterDatasourceFromDB.java @@ -62,7 +62,7 @@ public class ReadMasterDatasourceFromDB implements Closeable { final String dbUrl = parser.get("postgresUrl"); final String dbUser = parser.get("postgresUser"); final String dbPassword = parser.get("postgresPassword"); - final String hdfsPath = parser.get("hdfsPath") + "/datasourceMasters"; + final String hdfsPath = parser.get("hdfsPath"); final String hdfsNameNode = parser.get("hdfsNameNode"); try ( From 
d43663d30f6a3039a28ad8a6ac9723dd3fce4e9c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 25 Jul 2022 17:54:10 +0200 Subject: [PATCH 20/32] adapted RorActionSet test, it should not create parent/child rels --- .../ror/GenerateRorActionSetJobTest.java | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJobTest.java index d50c1d5f39..5736bd95ed 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJobTest.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.actionmanager.ror; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import java.io.FileInputStream; import java.util.List; @@ -38,25 +39,20 @@ class GenerateRorActionSetJobTest { .readValue(IOUtils.toString(getClass().getResourceAsStream("ror_org.json")), RorOrganization.class); final List> aas = GenerateRorActionSetJob.convertRorOrg(r); - Assertions.assertEquals(3, aas.size()); + Assertions.assertEquals(1, aas.size()); assertEquals(Organization.class, aas.get(0).getClazz()); - assertEquals(Relation.class, aas.get(1).getClazz()); - assertEquals(Relation.class, aas.get(2).getClazz()); final Organization o = (Organization) aas.get(0).getPayload(); - final Relation r1 = (Relation) aas.get(1).getPayload(); - final Relation r2 = (Relation) aas.get(2).getPayload(); - assertEquals(o.getId(), r1.getSource()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertEquals(ModelConstants.IS_PARENT_OF, r1.getRelClass()); - assertEquals(ModelConstants.IS_CHILD_OF, r2.getRelClass()); + 
assertNotNull(o); + + assertNotNull(o.getCountry()); + assertEquals("AU", o.getCountry().getClassid()); + + assertNotNull(o.getLegalname()); + assertEquals("Mount Stromlo Observatory", o.getLegalname().getValue()); System.out.println(mapper.writeValueAsString(o)); - System.out.println(mapper.writeValueAsString(r1)); - System.out.println(mapper.writeValueAsString(r2)); - } @Test From 67525076ec9ec4ff7b190a0bc1a15b5a31333388 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 26 Jul 2022 15:35:17 +0200 Subject: [PATCH 21/32] fixed test, now it compiles after commit a6977197b3925c520f76961cfff7b05c37a51d6f --- .../dhp/datacite/DataciteToOAFTest.scala | 17 +++++++--- .../dhp/datacite/DataciteUtilityTest.scala | 31 +++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteUtilityTest.scala diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala index 31784c7e92..68230b4778 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala @@ -2,11 +2,14 @@ package eu.dnetlib.dhp.datacite import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest -import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _} import org.apache.commons.io.FileUtils import org.apache.spark.SparkConf import org.apache.spark.sql.functions.{col, count} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.json4s.DefaultFormats +import org.json4s.JsonAST.{JField, JObject, JString} +import org.json4s.jackson.JsonMethods.parse import org.junit.jupiter.api.Assertions._ 
import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} @@ -45,6 +48,9 @@ class DataciteToOAFTest extends AbstractVocabularyTest { } + + + @Test def testConvert(): Unit = { @@ -70,17 +76,18 @@ class DataciteToOAFTest extends AbstractVocabularyTest { assertEquals(100, nativeSize) - spark.read.load(targetPath).printSchema(); + val result: Dataset[String] = spark.read.text(targetPath).as[String].map(DataciteUtilityTest.convertToOAF)(Encoders.STRING) + + + - val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf] result - .map(s => s.getClass.getSimpleName) .groupBy(col("value").alias("class")) .agg(count("value").alias("Total")) .show(false) - val t = spark.read.load(targetPath).count() + val t = spark.read.text(targetPath).as[String].count() assertTrue(t > 0) diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteUtilityTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteUtilityTest.scala new file mode 100644 index 0000000000..04d3c4a582 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteUtilityTest.scala @@ -0,0 +1,31 @@ +package eu.dnetlib.dhp.datacite + +import org.json4s.DefaultFormats +import org.json4s.JsonAST.{JField, JObject, JString} +import org.json4s.jackson.JsonMethods.parse + +object DataciteUtilityTest { + + def convertToOAF(input:String) : String = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + + + val isRelation:String = (json \\ "source").extractOrElse("NULL") + + if (isRelation != "NULL") { + return "Relation" + } + + val iType: List[String] = for { + JObject(instance) <- json \\ "instance" + JField("instancetype", JObject(instancetype)) <- instance + JField("classname", JString(classname)) <- instancetype + + } yield classname + + val l:String =iType.head.toLowerCase() + l + } + +} From 
ed98a6d9d04aafd8ed32212e3e77040d1f8338e0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 28 Jul 2022 10:15:14 +0200 Subject: [PATCH 22/32] [Datacite mapping] include the older datacite prefixed OpenAIRE id among the originalId[] --- .../dnetlib/dhp/datacite/DataciteToOAFTransformation.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index ff966aaea1..a7863d1449 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -584,7 +584,12 @@ object DataciteToOAFTransformation { JField("awardUri", JString(awardUri)) <- fundingReferences } yield awardUri + val oid = result.getId result.setId(IdentifierFactory.createIdentifier(result)) + if (!result.getId.equalsIgnoreCase(oid)) { + result.getOriginalId.add(oid) + } + var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null) From 1dd1e4fe3a224935d3c021f09ae4284e2a22b29e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 28 Jul 2022 11:27:08 +0200 Subject: [PATCH 23/32] extended test for mapping project_organization relations --- .../raw/MigrateDbEntitiesApplication.java | 2 +- .../raw/MigrateDbEntitiesApplicationTest.java | 62 ++++++++++++++----- .../projectorganization_resultset_entry.json | 17 +++-- 3 files changed, 57 insertions(+), 24 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 8296e99cd2..f40d3d19b1 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -440,7 +440,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final List properties = Lists .newArrayList( - keyValue("contribution", String.valueOf(rs.getDouble("totalcost"))), + keyValue("contribution", String.valueOf(rs.getDouble("contribution"))), keyValue("currency", rs.getString("currency"))); final Relation r1 = OafMapperUtils diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 9048a22ea9..06947103ab 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -1,9 +1,7 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.*; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.lenient; @@ -31,11 +29,12 @@ import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; @ExtendWith(MockitoExtension.class) -public class MigrateDbEntitiesApplicationTest { +class MigrateDbEntitiesApplicationTest { private MigrateDbEntitiesApplication app; @@ -61,7 
+60,7 @@ public class MigrateDbEntitiesApplicationTest { } @Test - public void testProcessService() throws Exception { + void testProcessService() throws Exception { final List fields = prepareMocks("services_resultset_entry.json"); final List list = app.processService(rs); @@ -74,7 +73,7 @@ public class MigrateDbEntitiesApplicationTest { .getCollectedfrom() .stream() .map(KeyValue::getKey) - .forEach(dsId -> assertValidId(dsId)); + .forEach(this::assertValidId); assertEquals(1, ds.getPid().size()); assertEquals("r3d100010218", ds.getPid().get(0).getValue()); @@ -163,14 +162,14 @@ public class MigrateDbEntitiesApplicationTest { .stream() .map(Qualifier::getSchemeid) .collect(Collectors.toCollection(HashSet::new)); - assertTrue(cpSchemeId.size() == 1); + assertEquals(1,cpSchemeId.size()); assertTrue(cpSchemeId.contains("eosc:contentpolicies")); HashSet cpSchemeName = ds .getContentpolicies() .stream() .map(Qualifier::getSchemename) .collect(Collectors.toCollection(HashSet::new)); - assertTrue(cpSchemeName.size() == 1); + assertEquals(1, cpSchemeName.size()); assertTrue(cpSchemeName.contains("eosc:contentpolicies")); assertEquals(2, ds.getContentpolicies().size()); assertEquals("Taxonomic classification", ds.getContentpolicies().get(0).getClassid()); @@ -193,7 +192,7 @@ public class MigrateDbEntitiesApplicationTest { } @Test - public void testProcessProject() throws Exception { + void testProcessProject() throws Exception { final List fields = prepareMocks("projects_resultset_entry.json"); final List list = app.processProject(rs); @@ -211,7 +210,7 @@ public class MigrateDbEntitiesApplicationTest { } @Test - public void testProcessOrganization() throws Exception { + void testProcessOrganization() throws Exception { final List fields = prepareMocks("organizations_resultset_entry.json"); final List list = app.processOrganization(rs); @@ -238,7 +237,7 @@ public class MigrateDbEntitiesApplicationTest { } @Test - public void testProcessDatasourceOrganization() throws 
Exception { + void testProcessDatasourceOrganization() throws Exception { final List fields = prepareMocks("datasourceorganization_resultset_entry.json"); final List list = app.processServiceOrganization(rs); @@ -255,7 +254,7 @@ public class MigrateDbEntitiesApplicationTest { } @Test - public void testProcessProjectOrganization() throws Exception { + void testProcessProjectOrganization() throws Exception { final List fields = prepareMocks("projectorganization_resultset_entry.json"); final List list = app.processProjectOrganization(rs); @@ -271,6 +270,38 @@ public class MigrateDbEntitiesApplicationTest { assertEquals(r2.getSource(), r1.getTarget()); assertValidId(r1.getCollectedfrom().get(0).getKey()); assertValidId(r2.getCollectedfrom().get(0).getKey()); + + assertEquals(ModelConstants.PROJECT_ORGANIZATION, r1.getRelType()); + assertEquals(ModelConstants.PROJECT_ORGANIZATION, r2.getRelType()); + + assertEquals(ModelConstants.PARTICIPATION, r1.getSubRelType()); + assertEquals(ModelConstants.PARTICIPATION, r2.getSubRelType()); + + if (r1.getSource().startsWith("40")) { + assertEquals(ModelConstants.HAS_PARTICIPANT, r1.getRelClass()); + assertEquals(ModelConstants.IS_PARTICIPANT, r2.getRelClass()); + } else if (r1.getSource().startsWith("20")) { + assertEquals(ModelConstants.IS_PARTICIPANT, r1.getRelClass()); + assertEquals(ModelConstants.HAS_PARTICIPANT, r2.getRelClass()); + } + + assertNotNull(r1.getProperties()); + checkProperty(r1, "contribution", "436754.0"); + checkProperty(r2, "contribution", "436754.0"); + + checkProperty(r1, "currency","EUR"); + checkProperty(r2, "currency", "EUR"); + } + + private void checkProperty(Relation r, String property, String value) { + final List p = r + .getProperties() + .stream() + .filter(kv -> kv.getKey().equals(property)) + .collect(Collectors.toList()); + assertFalse(p.isEmpty()); + assertEquals(1, p.size()); + assertEquals(value, p.get(0).getValue()); } @Test @@ -289,7 +320,7 @@ public class MigrateDbEntitiesApplicationTest 
{ } @Test - public void testProcessClaims_rels() throws Exception { + void testProcessClaims_rels() throws Exception { final List fields = prepareMocks("claimsrel_resultset_entry.json"); final List list = app.processClaims(rs); @@ -320,9 +351,6 @@ public class MigrateDbEntitiesApplicationTest { assertValidId(r1.getCollectedfrom().get(0).getKey()); assertValidId(r2.getCollectedfrom().get(0).getKey()); - - // System.out.println(new ObjectMapper().writeValueAsString(r1)); - // System.out.println(new ObjectMapper().writeValueAsString(r2)); } private List prepareMocks(final String jsonFile) throws IOException, SQLException { @@ -385,7 +413,7 @@ public class MigrateDbEntitiesApplicationTest { final String[] values = ((List) tf.getValue()) .stream() .filter(Objects::nonNull) - .map(o -> o.toString()) + .map(Object::toString) .toArray(String[]::new); Mockito.when(arr.getArray()).thenReturn(values); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json index a3305926df..02cebae385 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/projectorganization_resultset_entry.json @@ -2,12 +2,12 @@ { "field": "project", "type": "string", - "value": "nsf_________::1700003" + "value": "corda__h2020::824273" }, { "field": "resporganization", "type": "string", - "value": "nsf_________::University_of_Notre_Dame" + "value": "corda__h2020::999945647" }, { "field": "participantnumber", @@ -16,8 +16,13 @@ }, { "field": "contribution", - "type": "not_used", - "value": null + "type": "double", + "value": 436754 + }, + { + "field": "currency", + "type": "string", + "value": "EUR" }, { "field": "startdate", @@ -52,12 +57,12 @@ { "field": 
"collectedfromid", "type": "string", - "value": "openaire____::nsf" + "value": "openaire____::corda_h2020" }, { "field": "collectedfromname", "type": "string", - "value": "NSF - National Science Foundation" + "value": "CORDA - COmmon Research DAta Warehouse - Horizon 2020" }, { "field": "semantics", From 3329b6ce6bdccdab25c4bb900cc8ad4f78d8c77c Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 29 Jul 2022 10:54:20 +0200 Subject: [PATCH 24/32] [EOSC TAG] added fix for NPE on subjects --- .../src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java index 7364966415..a96e6a3bc8 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java @@ -183,6 +183,8 @@ public class SparkEoscTag { } private static boolean containsCriteriaNotebook(Software s) { + if(!Optional.ofNullable(s.getSubject()).isPresent()) + return false; if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter"))) return true; if (s From 0727f0ef481f783dbfcfd670bd4d91ee88a34f7a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 29 Jul 2022 11:55:34 +0200 Subject: [PATCH 25/32] [EOSC tag] avoid NPEs --- .../dhp/bulktag/eosc/SparkEoscTag.java | 41 +++++++++++-------- .../dnetlib/dhp/bulktag/EOSCTagJobTest.java | 2 +- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java index a96e6a3bc8..c131399cc0 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/eosc/SparkEoscTag.java @@ -23,6 +23,10 @@ import eu.dnetlib.dhp.schema.oaf.*; public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public static final String EOSC_GALAXY_WORKFLOW = "EOSC::Galaxy Workflow"; + public static final String EOSC_TWITTER_DATA = "EOSC::Twitter Data"; + public static final String EOSC_JUPYTER_NOTEBOOK = "EOSC::Jupyter Notebook"; + public static final String COMPLIES_WITH = "compliesWith"; public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -76,8 +80,8 @@ public class SparkEoscTag { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); addEIG( - s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", - "compliesWith"); + s.getEoscifguidelines(), EOSC_JUPYTER_NOTEBOOK, EOSC_JUPYTER_NOTEBOOK, "", + COMPLIES_WITH); } if (containsCriteriaGalaxy(s)) { @@ -85,7 +89,7 @@ public class SparkEoscTag { s.setEoscifguidelines(new ArrayList<>()); addEIG( - s.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); + s.getEoscifguidelines(), EOSC_GALAXY_WORKFLOW, EOSC_GALAXY_WORKFLOW, "", COMPLIES_WITH); } return s; }, Encoders.bean(Software.class)) @@ -108,11 +112,11 @@ public class SparkEoscTag { if (containsCriteriaGalaxy(orp)) { addEIG( - orp.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", - "compliesWith"); + orp.getEoscifguidelines(), EOSC_GALAXY_WORKFLOW, EOSC_GALAXY_WORKFLOW, "", + COMPLIES_WITH); } if (containscriteriaTwitter(orp)) { - addEIG(orp.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); + addEIG(orp.getEoscifguidelines(), EOSC_TWITTER_DATA, EOSC_TWITTER_DATA, "", COMPLIES_WITH); } return orp; }, 
Encoders.bean(OtherResearchProduct.class)) @@ -133,7 +137,7 @@ public class SparkEoscTag { if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - addEIG(d.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); + addEIG(d.getEoscifguidelines(), EOSC_TWITTER_DATA, EOSC_TWITTER_DATA, "", COMPLIES_WITH); } return d; }, Encoders.bean(Dataset.class)) @@ -163,10 +167,12 @@ public class SparkEoscTag { (words.contains("data") || words.contains("dataset"))) return true; - if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) && - r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data"))) - return true; - return false; + return Optional + .ofNullable(r.getSubject()) + .map( + s -> s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) && + s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data"))) + .orElse(false); } private static boolean containsCriteriaGalaxy(Result r) { @@ -176,14 +182,16 @@ public class SparkEoscTag { words.contains("workflow")) return true; - if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) && - r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow"))) - return true; - return false; + return Optional + .ofNullable(r.getSubject()) + .map( + s -> s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) && + s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow"))) + .orElse(false); } private static boolean containsCriteriaNotebook(Software s) { - if(!Optional.ofNullable(s.getSubject()).isPresent()) + if (!Optional.ofNullable(s.getSubject()).isPresent()) return false; if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter"))) return true; @@ -225,6 +233,5 @@ public class SparkEoscTag 
{ Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))))); return words; - } } diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index 17e570fdf4..bfe4f64482 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -6,7 +6,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -import eu.dnetlib.dhp.bulktag.eosc.SparkEoscTag; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -24,6 +23,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.bulktag.eosc.SparkEoscTag; import eu.dnetlib.dhp.schema.oaf.*; public class EOSCTagJobTest { From f62c4e05cd239830628154dc09fb7d386f135201 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 29 Jul 2022 11:56:01 +0200 Subject: [PATCH 26/32] code formatting --- .../dnetlib/dhp/sx/bio/pubmed/PMArticle.java | 1 - .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 40 +++++++-------- .../dhp/datacite/DataciteToOAFTest.scala | 10 +--- .../dhp/datacite/DataciteUtilityTest.scala | 11 ++-- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 51 ++++++++----------- .../raw/MigrateDbEntitiesApplicationTest.java | 12 ++--- 6 files changed, 53 insertions(+), 72 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java index 9287a8cdd6..3fb814606a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java @@ -255,7 +255,6 @@ 
public class PMArticle implements Serializable { return grants; } - public String getPmcId() { return pmcId; } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 24a1fa62b9..42bafc93eb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -56,13 +56,11 @@ object PubMedToOaf { null } - - def createOriginalOpenaireId(article:PMArticle) :String = { + def createOriginalOpenaireId(article: PMArticle): String = { if (StringUtils.isNotEmpty(article.getPmcId)) { - val md5 = DHPUtils.md5(s"$OAI_HEADER${article.getPmcId.replace("PMC","")}") + val md5 = DHPUtils.md5(s"$OAI_HEADER${article.getPmcId.replace("PMC", "")}") s"$OLD_PMC_PREFIX$md5" - } - else + } else null } @@ -142,26 +140,24 @@ object PubMedToOaf { val pidList = ListBuffer[StructuredProperty]() pidList += OafMapperUtils.structuredProperty( - article.getPmid, - PidType.pmid.toString, - PidType.pmid.toString, + article.getPmid, + PidType.pmid.toString, + PidType.pmid.toString, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + dataInfo + ) + + if (StringUtils.isNotBlank(article.getPmcId)) { + pidList += OafMapperUtils.structuredProperty( + article.getPmcId, + PidType.pmc.toString, + PidType.pmc.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo ) - - - if (StringUtils.isNotBlank(article.getPmcId)) - { - pidList += OafMapperUtils.structuredProperty( - article.getPmcId, - PidType.pmc.toString, - PidType.pmc.toString, - ModelConstants.DNET_PID_TYPES, - ModelConstants.DNET_PID_TYPES, - dataInfo - ) - } + } if (pidList == null) return null @@ -297,7 +293,7 @@ object PubMedToOaf { if (StringUtils.isNotEmpty(article.getPmcId)) { val originalIDS = ListBuffer[String]() 
originalIDS += createOriginalOpenaireId(article) - pidList.map(s => s.getValue).foreach(p =>originalIDS += p) + pidList.map(s => s.getValue).foreach(p => originalIDS += p) result.setOriginalId(originalIDS.asJava) } else result.setOriginalId(pidList.map(s => s.getValue).asJava) diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala index 68230b4778..48da049da6 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala @@ -48,9 +48,6 @@ class DataciteToOAFTest extends AbstractVocabularyTest { } - - - @Test def testConvert(): Unit = { @@ -76,11 +73,8 @@ class DataciteToOAFTest extends AbstractVocabularyTest { assertEquals(100, nativeSize) - val result: Dataset[String] = spark.read.text(targetPath).as[String].map(DataciteUtilityTest.convertToOAF)(Encoders.STRING) - - - - + val result: Dataset[String] = + spark.read.text(targetPath).as[String].map(DataciteUtilityTest.convertToOAF)(Encoders.STRING) result .groupBy(col("value").alias("class")) diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteUtilityTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteUtilityTest.scala index 04d3c4a582..942e0958ef 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteUtilityTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteUtilityTest.scala @@ -6,25 +6,24 @@ import org.json4s.jackson.JsonMethods.parse object DataciteUtilityTest { - def convertToOAF(input:String) : String = { + def convertToOAF(input: String): String = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) - - val isRelation:String 
= (json \\ "source").extractOrElse("NULL") + val isRelation: String = (json \\ "source").extractOrElse("NULL") if (isRelation != "NULL") { return "Relation" } val iType: List[String] = for { - JObject(instance) <- json \\ "instance" + JObject(instance) <- json \\ "instance" JField("instancetype", JObject(instancetype)) <- instance - JField("classname", JString(classname)) <- instancetype + JField("classname", JString(classname)) <- instancetype } yield classname - val l:String =iType.head.toLowerCase() + val l: String = iType.head.toLowerCase() l } diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index b021e5e078..827d23e720 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -76,12 +76,11 @@ class BioScholixTest extends AbstractVocabularyTest { } - - private def checkPMArticle(article:PMArticle): Unit = { + private def checkPMArticle(article: PMArticle): Unit = { assertNotNull(article.getPmid) assertNotNull(article.getTitle) assertNotNull(article.getAuthors) - article.getAuthors.asScala.foreach{a => + article.getAuthors.asScala.foreach { a => assertNotNull(a) assertNotNull(a.getFullName) } @@ -89,20 +88,21 @@ class BioScholixTest extends AbstractVocabularyTest { } @Test - def testParsingPubmedXML():Unit = { - val xml = new XMLEventReader(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))) + def testParsingPubmedXML(): Unit = { + val xml = new XMLEventReader( + Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) + ) val parser = new PMParser(xml) parser.foreach(checkPMArticle) } - - private def checkPubmedPublication(o:Oaf): Unit = { + private def checkPubmedPublication(o: Oaf): Unit = { 
assertTrue(o.isInstanceOf[Publication]) - val p:Publication = o.asInstanceOf[Publication] + val p: Publication = o.asInstanceOf[Publication] assertNotNull(p.getId) assertNotNull(p.getTitle) - p.getTitle.asScala.foreach(t =>assertNotNull(t.getValue)) - p.getAuthor.asScala.foreach(a =>assertNotNull(a.getFullname)) + p.getTitle.asScala.foreach(t => assertNotNull(t.getValue)) + p.getAuthor.asScala.foreach(a => assertNotNull(a.getFullname)) assertNotNull(p.getInstance()) p.getInstance().asScala.foreach { i => assertNotNull(i.getCollectedfrom) @@ -112,28 +112,26 @@ class BioScholixTest extends AbstractVocabularyTest { assertNotNull(p.getOriginalId) p.getOriginalId.asScala.foreach(oId => assertNotNull(oId)) - - val hasPMC = p.getInstance().asScala.exists(i => i.getPid.asScala.exists(pid => pid.getQualifier.getClassid.equalsIgnoreCase(PidType.pmc.toString))) - - + val hasPMC = p + .getInstance() + .asScala + .exists(i => i.getPid.asScala.exists(pid => pid.getQualifier.getClassid.equalsIgnoreCase(PidType.pmc.toString))) if (hasPMC) { assertTrue(p.getOriginalId.asScala.exists(oId => oId.startsWith("od_______267::"))) } } - @Test - def testPubmedOriginalID():Unit = { - val article:PMArticle = new PMArticle - + def testPubmedOriginalID(): Unit = { + val article: PMArticle = new PMArticle article.setPmid("1234") article.setTitle("a Title") // VERIFY PUBLICATION IS NOT NULL - article.getPublicationTypes.add( new PMSubject("article",null, null)) + article.getPublicationTypes.add(new PMSubject("article", null, null)) var publication = PubMedToOaf.convert(article, vocabularies).asInstanceOf[Publication] assertNotNull(publication) assertEquals("50|pmid________::81dc9bdb52d04dc20036dbd8313ed055", publication.getId) @@ -146,30 +144,25 @@ class BioScholixTest extends AbstractVocabularyTest { // VERIFY ORIGINAL ID GENERATE IN OLD WAY USING PMC IDENTIFIER EXISTS - - val oldOpenaireID ="od_______267::0000072375bc0e68fa09d4e6b7658248" + val oldOpenaireID = 
"od_______267::0000072375bc0e68fa09d4e6b7658248" val hasOldOpenAIREID = publication.getOriginalId.asScala.exists(o => o.equalsIgnoreCase(oldOpenaireID)) assertTrue(hasOldOpenAIREID) } - @Test - def testPubmedMapping() :Unit = { + def testPubmedMapping(): Unit = { - val xml = new XMLEventReader(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))) + val xml = new XMLEventReader( + Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) + ) val parser = new PMParser(xml) val results = ListBuffer[Oaf]() parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies)) - - - results.foreach(checkPubmedPublication) - - } @Test diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 06947103ab..069edc5a60 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -162,7 +162,7 @@ class MigrateDbEntitiesApplicationTest { .stream() .map(Qualifier::getSchemeid) .collect(Collectors.toCollection(HashSet::new)); - assertEquals(1,cpSchemeId.size()); + assertEquals(1, cpSchemeId.size()); assertTrue(cpSchemeId.contains("eosc:contentpolicies")); HashSet cpSchemeName = ds .getContentpolicies() @@ -289,16 +289,16 @@ class MigrateDbEntitiesApplicationTest { checkProperty(r1, "contribution", "436754.0"); checkProperty(r2, "contribution", "436754.0"); - checkProperty(r1, "currency","EUR"); + checkProperty(r1, "currency", "EUR"); checkProperty(r2, "currency", "EUR"); } private void checkProperty(Relation r, String property, String value) { final List p = r - .getProperties() - .stream() - .filter(kv -> kv.getKey().equals(property)) - 
.collect(Collectors.toList()); + .getProperties() + .stream() + .filter(kv -> kv.getKey().equals(property)) + .collect(Collectors.toList()); assertFalse(p.isEmpty()); assertEquals(1, p.size()); assertEquals(value, p.get(0).getValue()); From 92e48f12f70ce7631a78229f1a4ca4da7ace6e48 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 29 Jul 2022 13:54:00 +0200 Subject: [PATCH 27/32] [metadata collection] updated collector plugin name --- .../main/java/eu/dnetlib/dhp/collection/CollectorWorker.java | 2 +- .../java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index 9d94000682..c35fbb497f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -118,7 +118,7 @@ public class CollectorWorker extends ReportingJob { return new RestCollectorPlugin(clientParams); case file: return new FileCollectorPlugin(fileSystem); - case fileGZip: + case fileGzip: return new FileGZipCollectorPlugin(fileSystem); case other: final CollectorPlugin.NAME.OTHER_NAME plugin = Optional diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 08084e22a5..a19ca5c685 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException; public interface CollectorPlugin { enum NAME { - oai, other, rest_json2xml, file, 
fileGZip; + oai, other, rest_json2xml, file, fileGzip; public enum OTHER_NAME { mdstore_mongodb_dump, mdstore_mongodb From 9886fe87ecab8d90473a3c62b5cb83c080ec6864 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 29 Jul 2022 16:34:50 +0300 Subject: [PATCH 28/32] - Added FOS classification - Added extra orgs in monitor - Fixed result-project and organization-project tables --- .../oozie_app/scripts/step20-createMonitorDB.sql | 12 +++++++++++- .../dhp/oa/graph/stats/oozie_app/scripts/step6.sql | 4 ++-- .../dhp/oa/graph/stats/oozie_app/scripts/step7.sql | 13 ++++++++++++- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 7412910a91..cc6b69e34e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -38,7 +38,14 @@ create table TARGET.result stored as parquet as 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg - 'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII) + 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) + + 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr + 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw + 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly + 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- 
Technical University of Crete + 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus + 'openorgs____::4ac562f0376fce3539504567649cb373' -- University of Patras ) )) foo; compute stats TARGET.result; @@ -107,6 +114,9 @@ compute stats TARGET.result_sources; create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_topics; +create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_fos; + create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index dc7c01046e..5461afde6d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -11,13 +11,13 @@ where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r -WHERE r.reltype = 'projectOrganization' +WHERE r.reltype = 'projectOrganization' and r.source like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; 
CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'resultProject' +WHERE r.reltype = 'resultProject' and r.target like '40|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; create table ${stats_db_name}.project_classification STORED AS PARQUET as diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index b5eba6111b..1514ecf524 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -123,6 +123,16 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; +create table ${stats_db_name}.result_fos stored as parquet as +with + lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), + lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), + lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification') +select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 +from lvl1 + join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) + join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); + CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM 
${openaire_db_name}.relation r @@ -134,4 +144,5 @@ CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result - JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; \ No newline at end of file + JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; + From 209c7e9dab323932dba65bb88addf2c8ab848d92 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 1 Aug 2022 09:05:35 +0200 Subject: [PATCH 29/32] [datacite] avoid UnsupportedOperationException --- .../eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index a7863d1449..0f82ecadf5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -587,7 +587,7 @@ object DataciteToOAFTransformation { val oid = result.getId result.setId(IdentifierFactory.createIdentifier(result)) if (!result.getId.equalsIgnoreCase(oid)) { - result.getOriginalId.add(oid) + result.setOriginalId((oid::List(doi)).asJava) } var relations: List[Relation] = From 1778d40c40654db9bde707ee2ffdc4e3f922bce9 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Tue, 2 Aug 2022 13:39:34 +0300 Subject: [PATCH 30/32] latest version of indicators --- .../scripts/step16-createIndicatorsTables.sql | 993 +++++++++++++----- 1 file changed, 709 insertions(+), 284 deletions(-) diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index db40cf9731..25776316ba 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,60 +1,62 @@ ----- Sprint 1 ---- +-- Sprint 1 ---- create table indi_pub_green_oa stored as parquet as select distinct p.id, coalesce(green_oa, 0) as green_oa from publication p -left outer join ( -select p.id, 1 as green_oa -from publication p -join result_instance ri on ri.id = p.id -join datasource on datasource.id = ri.hostedby -where datasource.type like '%Repository%' -and (ri.accessright = 'Open Access' -or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp -on p.id= tmp.id; + left outer join ( + select p.id, 1 as green_oa + from publication p + join result_instance ri on ri.id = p.id + join datasource on datasource.id = ri.hostedby + where datasource.type like '%Repository%' + and (ri.accessright = 'Open Access' + or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp + on p.id= tmp.id; compute stats indi_pub_green_oa; create table indi_pub_grey_lit stored as parquet as select distinct p.id, coalesce(grey_lit, 0) as grey_lit from publication p -left outer join ( -select p.id, 1 as grey_lit -from publication p -join result_classifications rt on rt.id = p.id -where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and -not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; 
+ left outer join ( + select p.id, 1 as grey_lit + from publication p + join result_classifications rt on rt.id = p.id + where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and + not exists (select 1 from result_classifications rc where type ='Other literature type' + and rc.id=p.id)) tmp on p.id=tmp.id; compute stats indi_pub_grey_lit; create table indi_pub_doi_from_crossref stored as parquet as select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref from publication p -left outer join -(select ri.id, 1 as doi_from_crossref from result_instance ri -join datasource d on d.id = ri.collectedfrom -where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp -on tmp.id=p.id; + left outer join + (select ri.id, 1 as doi_from_crossref from result_instance ri + join datasource d on d.id = ri.collectedfrom + where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp + on tmp.id=p.id; compute stats indi_pub_doi_from_crossref; ----- Sprint 2 ---- + +-- Sprint 2 ---- create table indi_result_has_cc_licence stored as parquet as select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license from result r -left outer join (select r.id, license.type as lic from result r -join result_licenses as license on license.id = r.id -where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp -on r.id= tmp.id; + left outer join (select r.id, license.type as lic from result r + join result_licenses as license on license.id = r.id + where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp + on r.id= tmp.id; compute stats indi_result_has_cc_licence; create table indi_result_has_cc_licence_url stored as parquet as select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url from result r -left 
outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host -from result r -join result_licenses as license on license.id = r.id -WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp -on r.id= tmp.id; + left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host + from result r + join result_licenses as license on license.id = r.id + WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp + on r.id= tmp.id; compute stats indi_result_has_cc_licence_url; @@ -67,8 +69,8 @@ compute stats indi_pub_has_abstract; create table indi_result_with_orcid stored as parquet as select distinct r.id, coalesce(has_orcid, 0) as has_orcid from result r -left outer join (select id, 1 as has_orcid from result_orcid) tmp -on r.id= tmp.id; + left outer join (select id, 1 as has_orcid from result_orcid) tmp + on r.id= tmp.id; compute stats indi_result_with_orcid; @@ -76,59 +78,89 @@ compute stats indi_result_with_orcid; create table indi_funded_result_with_fundref stored as parquet as select distinct r.id, coalesce(fundref, 0) as fundref from project_results r -left outer join (select distinct id, 1 as fundref from project_results -where provenance='Harvested') tmp -on r.id= tmp.id; + left outer join (select distinct id, 1 as fundref from project_results + where provenance='Harvested') tmp + on r.id= tmp.id; compute stats indi_funded_result_with_fundref; --- create table indi_result_org_country_collab stored as parquet as --- with tmp as --- (select o.id as id, o.country , ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where o.country <> 'UNKNOWN') --- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id and o1.country<>o2.country --- group by o1.id, o1.type,o2.country; --- --- compute stats 
indi_result_org_country_collab; +create table indi_result_org_collab stored as parquet as +select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations +from result_organization as o1 + join result_organization as o2 on o1.id=o2.id and o1.organization!=o2.organization +group by o1.organization, o2.organization; --- create table indi_result_org_collab stored as parquet as --- with tmp as --- (select o.id, ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id) --- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id --- group by o1.id, o2.id, o1.type; --- --- compute stats indi_result_org_collab; +compute stats indi_result_org_collab; + +create table indi_result_org_country_collab stored as parquet as + with tmp as + (select o.id as id, o.country , ro.id as result,r.type from organization o + join result_organization ro on o.id=ro.organization + join result r on r.id=ro.id where o.country <> 'UNKNOWN') +select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations +from tmp as o1 + join tmp as o2 on o1.result=o2.result +where o1.id<>o2.id and o1.country<>o2.country +group by o1.id, o1.type,o2.country; + +compute stats indi_result_org_country_collab; + +create table indi_result_org_collab stored as parquet as + with tmp as + (select o.id, ro.id as result,r.type from organization o + join result_organization ro on o.id=ro.organization + join result r on r.id=ro.id) +select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations +from tmp as o1 + join tmp as o2 on o1.result=o2.result +where o1.id<>o2.id +group by o1.id, o2.id, o1.type; + +compute stats indi_result_org_collab; + +create table indi_project_collab_org stored as parquet as +select o1.id org1,o2.id org2, count(distinct o1.project) as 
collaborations +from organization_projects as o1 + join organization_projects as o2 on o1.project=o2.project +where o1.id!=o2.id +group by o1.id, o2.id; + +compute stats indi_project_collab_org; + +create table indi_project_collab_org_country stored as parquet as + with tmp as + (select o.id organization, o.country , ro.project as project from organization o + join organization_projects ro on o.id=ro.id + and o.country <> 'UNKNOWN') +select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations +from tmp as o1 + join tmp as o2 on o1.project=o2.project +where o1.organization<>o2.organization and o1.country<>o2.country +group by o1.organization, o2.country; + +compute stats indi_project_collab_org_country; create table indi_funder_country_collab stored as parquet as -with tmp as (select funder, project, country from organization_projects op -join organization o on o.id=op.id -join project p on p.id=op.project -where country <> 'UNKNOWN') + with tmp as (select funder, project, country from organization_projects op + join organization o on o.id=op.id + join project p on p.id=op.project + where country <> 'UNKNOWN') select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations from tmp as f1 -join tmp as f2 on f1.project=f2.project + join tmp as f2 on f1.project=f2.project where f1.country<>f2.country group by f1.funder, f2.country, f1.country; compute stats indi_funder_country_collab; create table indi_result_country_collab stored as parquet as -with tmp as -(select country, ro.id as result,r.type from organization o -join result_organization ro on o.id=ro.organization -join result r on r.id=ro.id) + with tmp as + (select country, ro.id as result,r.type from organization o + join result_organization ro on o.id=ro.organization + join result r on r.id=ro.id where country <> 'UNKNOWN') select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations from 
tmp as o1 -join tmp as o2 on o1.result=o2.result + join tmp as o2 on o1.result=o2.result where o1.country<>o2.country group by o1.country, o2.country, o1.type; @@ -138,255 +170,257 @@ compute stats indi_result_country_collab; create table indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from publication_datasources pd -left outer join ( -select pd.id, 1 as in_diamond_journal from publication_datasources pd -join datasource d on d.id=pd.datasource -join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) -and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp -on pd.id=tmp.id; + left outer join ( + select pd.id, 1 as in_diamond_journal from publication_datasources pd + join datasource d on d.id=pd.datasource + join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp + on pd.id=tmp.id; compute stats indi_pub_diamond; create table indi_pub_hybrid stored as parquet as select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid from publication_datasources pd -left outer join ( -select pd.id, 1 as is_hybrid from publication_datasources pd -join datasource d on d.id=pd.datasource -join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) -and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp -on pd.id=tmp.id; + left outer join ( + select pd.id, 1 as is_hybrid from publication_datasources pd + join datasource d on d.id=pd.datasource + join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp + on pd.id=tmp.id; compute stats indi_pub_hybrid; create table indi_pub_in_transformative stored as parquet as select distinct pd.id, coalesce(is_transformative, 0) as 
is_transformative from publication pd -left outer join ( -select pd.id, 1 as is_transformative from publication_datasources pd -join datasource d on d.id=pd.datasource -join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) -and ps.is_transformative_journal=true) tmp -on pd.id=tmp.id; + left outer join ( + select pd.id, 1 as is_transformative from publication_datasources pd + join datasource d on d.id=pd.datasource + join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + and ps.is_transformative_journal=true) tmp + on pd.id=tmp.id; compute stats indi_pub_in_transformative; create table indi_pub_closed_other_open stored as parquet as select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri -left outer join -(select ri.id, 1 as pub_closed_other_open from result_instance ri -join publication p on p.id=ri.id -join datasource d on ri.hostedby=d.id -where d.type like '%Journal%' and ri.accessright='Closed Access' and -(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp -on tmp.id=ri.id; + left outer join + (select ri.id, 1 as pub_closed_other_open from result_instance ri + join publication p on p.id=ri.id + join datasource d on ri.hostedby=d.id + where d.type like '%Journal%' and ri.accessright='Closed Access' and + (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp + on tmp.id=ri.id; compute stats indi_pub_closed_other_open; + ---- Sprint 5 ---- create table indi_result_no_of_copies stored as parquet as select id, count(id) as number_of_copies from result_instance group by id; compute stats indi_result_no_of_copies; + ---- Sprint 6 ---- -create table indi_pub_gold_oa stored as parquet as -WITH gold_oa AS ( - SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn - FROM stats_ext.oa_journals - WHERE issn_1 != "" - UNION ALL - SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn - FROM 
stats_ext.oa_journals - WHERE issn_2 != "" ), -issn AS ( - SELECT * FROM - (SELECT id, issn_printed as issn - FROM datasource WHERE issn_printed IS NOT NULL - UNION - SELECT id, issn_online as issn - FROM datasource WHERE issn_online IS NOT NULL) as issn - WHERE LENGTH(issn) > 7) -SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold -FROM publication_datasources pd -LEFT OUTER JOIN ( - SELECT pd.id, 1 as is_gold FROM publication_datasources pd - JOIN issn on issn.id=pd.datasource - JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; +--create table indi_pub_gold_oa stored as parquet as +--WITH gold_oa AS ( +-- SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn +-- FROM stats_ext.oa_journals +-- WHERE issn_1 != "" +-- UNION ALL +-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn +-- FROM stats_ext.oa_journals +-- WHERE issn_2 != "" ), +--issn AS ( +-- SELECT * FROM +-- (SELECT id, issn_printed as issn +-- FROM datasource WHERE issn_printed IS NOT NULL +-- UNION +-- SELECT id, issn_online as issn +-- FROM datasource WHERE issn_online IS NOT NULL) as issn +-- WHERE LENGTH(issn) > 7) +--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold +--FROM publication_datasources pd +--LEFT OUTER JOIN ( +-- SELECT pd.id, 1 as is_gold FROM publication_datasources pd +-- JOIN issn on issn.id=pd.datasource +-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; -compute stats indi_pub_gold_oa; +--compute stats indi_pub_gold_oa; +-- +--create table indi_datasets_gold_oa stored as parquet as +--WITH gold_oa AS ( +-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn +-- FROM stats_ext.oa_journals +-- WHERE issn_1 != "" +-- UNION +-- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn +-- FROM stats_ext.oa_journals +-- WHERE issn_2 != "" ), +--issn AS ( +-- SELECT * +-- FROM ( +-- SELECT id,issn_printed as issn +-- FROM datasource +-- WHERE issn_printed IS NOT NULL +-- UNION +-- SELECT id, issn_online 
as issn +-- FROM datasource +-- WHERE issn_online IS NOT NULL ) as issn +-- WHERE LENGTH(issn) > 7) +--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold +--FROM dataset_datasources pd +--LEFT OUTER JOIN ( +-- SELECT pd.id, 1 as is_gold FROM dataset_datasources pd +-- JOIN issn on issn.id=pd.datasource +-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; +-- +--compute stats indi_datasets_gold_oa; -create table indi_datasets_gold_oa stored as parquet as -WITH gold_oa AS ( - SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn - FROM stats_ext.oa_journals - WHERE issn_1 != "" - UNION - ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn - FROM stats_ext.oa_journals - WHERE issn_2 != "" ), -issn AS ( - SELECT * - FROM ( - SELECT id,issn_printed as issn - FROM datasource - WHERE issn_printed IS NOT NULL - UNION - SELECT id, issn_online as issn - FROM datasource - WHERE issn_online IS NOT NULL ) as issn - WHERE LENGTH(issn) > 7) -SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold -FROM dataset_datasources pd -LEFT OUTER JOIN ( - SELECT pd.id, 1 as is_gold FROM dataset_datasources pd - JOIN issn on issn.id=pd.datasource - JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; - -compute stats indi_datasets_gold_oa; - -create table indi_software_gold_oa stored as parquet as -WITH gold_oa AS ( - SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn - FROM stats_ext.oa_journals - WHERE issn_1 != "" - UNION - ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn - FROM stats_ext.oa_journals - WHERE issn_2 != "" ), -issn AS ( - SELECT * - FROM ( - SELECT id,issn_printed as issn - FROM datasource - WHERE issn_printed IS NOT NULL - UNION - SELECT id, issn_online as issn - FROM datasource - WHERE issn_online IS NOT NULL ) as issn - WHERE LENGTH(issn) > 7) -SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold -FROM software_datasources pd -LEFT OUTER JOIN ( - SELECT pd.id, 1 as is_gold FROM 
software_datasources pd - JOIN issn on issn.id=pd.datasource - JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; - -compute stats indi_software_gold_oa; - -create table indi_org_findable stored as parquet as -with result_with_pid as ( - select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro - join result_pids rp on rp.id=ro.id - group by ro.organization), -result_has_abstract as ( - select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro - join result rp on rp.id=ro.id where rp.abstract=true - group by ro.organization), -allresults as ( - select organization, count(distinct id) no_allresults from result_organization - group by organization), -result_with_pid_share as ( - select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share - from allresults - join result_with_pid on result_with_pid.organization=allresults.organization), -result_with_abstract_share as ( - select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share - from allresults - join result_has_abstract on result_has_abstract.organization=allresults.organization) -select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable -from allresults -join result_with_pid_share on result_with_pid_share.organization=allresults.organization -left outer join ( - select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization; - -compute stats indi_org_findable; - -create table indi_org_openess stored as parquet as -WITH datasets_oa as ( - SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg - join openaire_prod_stats.result_organization ro on dg.id=ro.id - join openaire_prod_stats.dataset ds on dg.id=ds.id - WHERE dg.is_gold=1 - group by ro.organization), -software_oa as ( - SELECT 
ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg - join openaire_prod_stats.result_organization ro on dg.id=ro.id - join openaire_prod_stats.software ds on dg.id=ds.id - WHERE dg.is_gold=1 - group by ro.organization), -pubs_oa as ( - SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg - join openaire_prod_stats.result_organization ro on dg.id=ro.id - join openaire_prod_stats.publication ds on dg.id=ds.id - where dg.is_gold=1 - group by ro.organization), -allpubs as ( - SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro - join openaire_prod_stats.publication ps on ps.id=ro.id - group by ro.organization), -alldatasets as ( - SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro - join openaire_prod_stats.dataset ps on ps.id=ro.id - group by ro.organization), -allsoftware as ( - SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro - join openaire_prod_stats.software ps on ps.id=ro.id - group by ro.organization), -allpubsshare as ( - select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization), -alldatasetssshare as ( - select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c - from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization), -allsoftwaresshare as ( - select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s - from allsoftware - join software_oa on allsoftware.organization=software_oa.organization) -select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess -FROM allpubsshare -left outer join ( - select organization,c from - alldatasetssshare) tmp on tmp.organization=allpubsshare.organization -left outer join ( - select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization; - 
-compute stats indi_org_openess; +--create table indi_software_gold_oa stored as parquet as +--WITH gold_oa AS ( +-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn +-- FROM stats_ext.oa_journals +-- WHERE issn_1 != "" +-- UNION +-- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn +-- FROM stats_ext.oa_journals +-- WHERE issn_2 != "" ), +--issn AS ( +-- SELECT * +-- FROM ( +-- SELECT id,issn_printed as issn +-- FROM datasource +-- WHERE issn_printed IS NOT NULL +-- UNION +-- SELECT id, issn_online as issn +-- FROM datasource +-- WHERE issn_online IS NOT NULL ) as issn +-- WHERE LENGTH(issn) > 7) +--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold +--FROM software_datasources pd +--LEFT OUTER JOIN ( +-- SELECT pd.id, 1 as is_gold FROM software_datasources pd +-- JOIN issn on issn.id=pd.datasource +-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; +-- +--compute stats indi_software_gold_oa; +--create table indi_org_findable stored as parquet as +--with result_with_pid as ( +-- select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro +-- join result_pids rp on rp.id=ro.id +-- group by ro.organization), +--result_has_abstract as ( +-- select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro +-- join result rp on rp.id=ro.id where rp.abstract=true +-- group by ro.organization), +--allresults as ( +-- select organization, count(distinct id) no_allresults from result_organization +-- group by organization), +--result_with_pid_share as ( +-- select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share +-- from allresults +-- join result_with_pid on result_with_pid.organization=allresults.organization), +--result_with_abstract_share as ( +-- select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share +-- from allresults 
+-- join result_has_abstract on result_has_abstract.organization=allresults.organization) +--select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable +--from allresults +--join result_with_pid_share on result_with_pid_share.organization=allresults.organization +--left outer join ( +-- select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization; +-- +--compute stats indi_org_findable; +-- +--create table indi_org_openess stored as parquet as +--WITH datasets_oa as ( +-- SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg +-- join result_organization ro on dg.id=ro.id +-- join dataset ds on dg.id=ds.id +-- WHERE dg.is_gold=1 +-- group by ro.organization), +--software_oa as ( +-- SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg +-- join result_organization ro on dg.id=ro.id +-- join software ds on dg.id=ds.id +-- WHERE dg.is_gold=1 +-- group by ro.organization), +--pubs_oa as ( +-- SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg +-- join result_organization ro on dg.id=ro.id +-- join publication ds on dg.id=ds.id +-- where dg.is_gold=1 +-- group by ro.organization), +--allpubs as ( +-- SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro +-- join publication ps on ps.id=ro.id +-- group by ro.organization), +--alldatasets as ( +-- SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro +-- join dataset ps on ps.id=ro.id +-- group by ro.organization), +--allsoftware as ( +-- SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro +-- join software ps on ps.id=ro.id +-- group by ro.organization), +--allpubsshare as ( +-- select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs +-- join pubs_oa on allpubs.organization=pubs_oa.organization), +--alldatasetssshare as 
( +-- select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c +-- from alldatasets +-- join datasets_oa on alldatasets.organization=datasets_oa.organization), +--allsoftwaresshare as ( +-- select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s +-- from allsoftware +-- join software_oa on allsoftware.organization=software_oa.organization) +--select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess +--FROM allpubsshare +--left outer join ( +-- select organization,c from +-- alldatasetssshare) tmp on tmp.organization=allpubsshare.organization +--left outer join ( +-- select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization; +-- +--compute stats indi_org_openess; +-- create table indi_pub_hybrid_oa_with_cc stored as parquet as -WITH hybrid_oa AS ( - SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn - FROM stats_ext.plan_s_jn - WHERE issn_print != "" - UNION ALL - SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn - FROM stats_ext.plan_s_jn - WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)), -issn AS ( - SELECT * - FROM ( - SELECT id, issn_printed as issn - FROM datasource - WHERE issn_printed IS NOT NULL - UNION - SELECT id,issn_online as issn - FROM datasource - WHERE issn_online IS NOT NULL ) as issn + WITH hybrid_oa AS ( + SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn + FROM stats_ext.plan_s_jn + WHERE issn_print != "" + UNION ALL + SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn + FROM stats_ext.plan_s_jn + WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)), + issn AS ( + SELECT * + FROM ( + SELECT id, issn_printed as issn + FROM datasource + WHERE issn_printed IS NOT NULL + UNION + SELECT id,issn_online as issn + FROM datasource + WHERE issn_online IS NOT NULL ) as issn WHERE LENGTH(issn) > 7) SELECT 
DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa FROM publication_datasources pd -LEFT OUTER JOIN ( + LEFT OUTER JOIN ( SELECT pd.id, 1 as is_hybrid_oa from publication_datasources pd - JOIN datasource d on d.id=pd.datasource - JOIN issn on issn.id=pd.datasource - JOIN hybrid_oa ON issn.issn = hybrid_oa.issn - JOIN indi_result_has_cc_licence cc on pd.id=cc.id + JOIN datasource d on d.id=pd.datasource + JOIN issn on issn.id=pd.datasource + JOIN hybrid_oa ON issn.issn = hybrid_oa.issn + JOIN indi_result_has_cc_licence cc on pd.id=cc.id where cc.has_cc_license=1) tmp on pd.id=tmp.id; compute stats indi_pub_hybrid_oa_with_cc; create table indi_pub_downloads stored as parquet as SELECT result_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats -join publication on result_id=id + join publication on result_id=id where downloads>0 GROUP BY result_id order by no_dowloads desc; @@ -395,7 +429,7 @@ compute stats indi_pub_downloads; create table indi_pub_downloads_datasource stored as parquet as SELECT result_id, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats -join publication on result_id=id + join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id order by result_id; @@ -404,7 +438,7 @@ compute stats indi_pub_downloads_datasource; create table indi_pub_downloads_year stored as parquet as SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us -join publication on result_id=id where downloads>0 + join publication on result_id=id where downloads>0 GROUP BY result_id, `year` order by `year` asc; @@ -412,9 +446,400 @@ compute stats indi_pub_downloads_year; create table indi_pub_downloads_datasource_year stored as parquet as SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us -join publication on result_id=id + join 
publication on result_id=id where downloads>0 GROUP BY result_id, repository_id, `year` order by `year` asc, result_id; -compute stats indi_pub_downloads_datasource_year; \ No newline at end of file +compute stats indi_pub_downloads_datasource_year; + +---- Sprint 7 ---- +create table indi_pub_gold_oa stored as parquet as + WITH gold_oa AS ( SELECT + issn_l, + journal_is_in_doaj, + journal_is_oa, + issn_1 as issn + FROM + stats_ext.oa_journals + WHERE + issn_1 != "" + UNION + ALL SELECT + issn_l, + journal_is_in_doaj, + journal_is_oa, + issn_2 as issn + FROM + stats_ext.oa_journals + WHERE + issn_2 != "" ), issn AS ( SELECT + * + FROM +( SELECT + id, + issn_printed as issn + FROM + datasource + WHERE + issn_printed IS NOT NULL + UNION + SELECT + id, + issn_online as issn + FROM + datasource + WHERE + issn_online IS NOT NULL or id like '%doajarticles%') as issn + WHERE + LENGTH(issn) > 7) +SELECT + DISTINCT pd.id, coalesce(is_gold, 0) as is_gold +FROM + publication_datasources pd + left outer join( + select pd.id, 1 as is_gold FROM publication_datasources pd + JOIN issn on issn.id=pd.datasource + JOIN gold_oa on issn.issn = gold_oa.issn) tmp + on pd.id=tmp.id; + +compute stats indi_pub_gold_oa; + +create table indi_pub_hybrid stored as parquet as + WITH gold_oa AS ( SELECT + issn_l, + journal_is_in_doaj, + journal_is_oa, + issn_1 as issn, + has_apc + FROM + stats_ext.oa_journals + WHERE + issn_1 != "" + UNION + ALL SELECT + issn_l, + journal_is_in_doaj, + journal_is_oa, + issn_2 as issn, + has_apc + FROM + stats_ext.oa_journals + WHERE + issn_2 != "" ), issn AS ( SELECT + * + FROM +( SELECT + id, + issn_printed as issn + FROM + datasource + WHERE + issn_printed IS NOT NULL + UNION + SELECT + id, + issn_online as issn + FROM + datasource + WHERE + issn_online IS NOT NULL or id like '%doajarticles%') as issn + WHERE + LENGTH(issn) > 7) +select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid +from publication_datasources pd + left outer join ( + select pd.id, 1 as 
is_hybrid from publication_datasources pd + join datasource d on d.id=pd.datasource + join issn on issn.id=pd.datasource + join gold_oa on issn.issn=gold_oa.issn + where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp + on pd.id=tmp.id; + +compute stats indi_pub_hybrid; + +create table indi_org_fairness stored as parquet as +--return results with PIDs, and rich metadata group by organization + with result_fair as + (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro + join result r on r.id=ro.id +--join result_pids rp on r.id=rp.id + where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and year>2003 + group by ro.organization), +--return all results group by organization + allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro + join result r on r.id=ro.id + where year>2003 + group by organization) +--return results_fair/all_results +select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness +from allresults + join result_fair on result_fair.organization=allresults.organization; + +compute stats indi_org_fairness; + +create table indi_org_fairness_pub_pr stored as parquet as + with result_fair as + (select ro.organization organization, count(distinct ro.id) no_result_fair + from result_organization ro + join publication p on p.id=ro.id + join indi_pub_doi_from_crossref dc on dc.id=p.id + join indi_pub_grey_lit gl on gl.id=p.id + where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) + and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0 + group by ro.organization), + allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro + join publication p on p.id=ro.id + where cast(year as int)>2003 + group by organization) +--return 
results_fair/all_results +select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness +from allresults + join result_fair on result_fair.organization=allresults.organization; + +compute stats indi_org_fairness_pub_pr; + +create table indi_org_fairness_pub_year stored as parquet as + with result_fair as + (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro + join publication p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 + group by ro.organization, year), + allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + join publication p on p.id=ro.id + where cast(year as int)>2003 + group by organization, year) +select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness +from allresults + join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; + +compute stats indi_org_fairness_pub_year; + +create table indi_org_fairness_pub as +with result_fair as + (select ro.organization organization, count(distinct ro.id) no_result_fair + from result_organization ro + join publication p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) + and (authors>0) and cast(year as int)>2003 + group by ro.organization), + allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro + join publication p on p.id=ro.id + where cast(year as int)>2003 + group by organization) +select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness +from allresults + join result_fair on result_fair.organization=allresults.organization; + +compute stats indi_org_fairness_pub; + +create table indi_org_fairness_year 
stored as parquet as + with result_fair as + (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro + join result r on r.id=ro.id + join result_pids rp on r.id=rp.id + where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and year>2003 + group by ro.organization, year), + allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + join result r on r.id=ro.id + where year>2003 + group by organization, year) +--return results_fair/all_results +select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness +from allresults + join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; + +compute stats indi_org_fairness_year; + +create table indi_org_findable_year stored as parquet as +--return results with PIDs group by organization,year + with result_with_pid as + (select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro + join result_pids rp on rp.id=ro.id + join result r on r.id=rp.id + where year >2003 + group by ro.organization, year), +--return all results group by organization,year + allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + join result r on r.id=ro.id + where year >2003 + group by organization, year) +--return results_with_pid/all_results +select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable +from allresults + join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year; + +compute stats indi_org_findable_year; + +create table indi_org_findable stored as parquet as +--return results with PIDs group by organization + with result_with_pid as + (select 
ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro + join result_pids rp on rp.id=ro.id + join result r on r.id=rp.id + where year >2003 + group by ro.organization), +--return all results group by organization + allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro + join result r on r.id=ro.id + where year >2003 + group by organization) +--return results_with_pid/all_results +select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable +from allresults + join result_with_pid on result_with_pid.organization=allresults.organization; + +compute stats indi_org_findable; + +create table indi_org_openess stored as parquet as + WITH pubs_oa as ( + SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r + join result_organization ro on ro.id=r.id + join result_instance ri on ri.id=r.id + where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') + and cast(r.year as int)>2003 + group by ro.organization), + datasets_oa as ( + SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r + join result_organization ro on ro.id=r.id + join result_instance ri on ri.id=r.id + where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') + and cast(r.year as int)>2003 + group by ro.organization), + software_oa as ( + SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r + join result_organization ro on ro.id=r.id + join result_instance ri on ri.id=r.id + where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') + and cast(r.year as int)>2003 + group by ro.organization), + allpubs as ( + SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + join publication ps on ps.id=ro.id + where cast(ps.year as int)>2003 + group by 
ro.organization), + alldatasets as ( + SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + join dataset ps on ps.id=ro.id + where cast(ps.year as int)>2003 + group by ro.organization), + allsoftware as ( + SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + join software ps on ps.id=ro.id + where cast(ps.year as int)>2003 + group by ro.organization), + allpubsshare as ( + select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization), + alldatasetssshare as ( + select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + from alldatasets + join datasets_oa on alldatasets.organization=datasets_oa.organization), + allsoftwaresshare as ( + select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + from allsoftware + join software_oa on allsoftware.organization=software_oa.organization) +select allpubsshare.organization, + (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + +(case when d is null then 0 else 1 end)) + org_openess FROM allpubsshare + left outer join (select organization,d from + alldatasetssshare) tmp1 + on tmp1.organization=allpubsshare.organization + left outer join (select organization,s from + allsoftwaresshare) tmp2 + on tmp2.organization=allpubsshare.organization; + +compute stats indi_org_openess; + +create table indi_org_openess_year stored as parquet as + WITH pubs_oa as ( + SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r + join result_organization ro on ro.id=r.id + join result_instance ri on ri.id=r.id + where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') + and cast(r.year as int)>2003 + group by ro.organization,r.year), + datasets_oa as ( + SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset 
r + join result_organization ro on ro.id=r.id + join result_instance ri on ri.id=r.id + where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') + and cast(r.year as int)>2003 + group by ro.organization, r.year), + software_oa as ( + SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r + join result_organization ro on ro.id=r.id + join result_instance ri on ri.id=r.id + where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') + and cast(r.year as int)>2003 + group by ro.organization, r.year), + allpubs as ( + SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + join publication p on p.id=ro.id where cast(p.year as int)>2003 + group by ro.organization, p.year), + alldatasets as ( + SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + join dataset d on d.id=ro.id where cast(d.year as int)>2003 + group by ro.organization, d.year), + allsoftware as ( + SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + join software s on s.id=ro.id where cast(s.year as int)>2003 + group by ro.organization, s.year), + allpubsshare as ( + select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int)), + alldatasetssshare as ( + select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + from alldatasets + join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int)), + allsoftwaresshare as ( + select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + from allsoftware + join software_oa on 
allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int)) +select allpubsshare.year, allpubsshare.organization, + (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + +(case when d is null then 0 else 1 end)) + org_openess FROM allpubsshare + left outer join (select year, organization,d from + alldatasetssshare) tmp1 + on tmp1.organization=allpubsshare.organization and tmp1.year=allpubsshare.year + left outer join (select year, organization,s from + allsoftwaresshare) tmp2 + on tmp2.organization=allpubsshare.organization and tmp2.year=allpubsshare.year; + +compute stats indi_org_openess_year; + +create table indi_pub_has_preprint stored as parquet as +select distinct p.id, coalesce(has_preprint, 0) as has_preprint +from publication_classifications p + left outer join ( + select p.id, 1 as has_preprint + from publication_classifications p + where p.type='Preprint') tmp + on p.id= tmp.id; + +compute stats indi_pub_has_preprint; + +create table indi_pub_in_subscribed stored as parquet as +select distinct p.id, coalesce(is_subscription, 0) as is_subscription +from publication p + left outer join( + select p.id, 1 as is_subscription from publication p + join indi_pub_gold_oa g on p.id=g.id + join indi_pub_hybrid h on p.id=h.id + join indi_pub_in_transformative t on p.id=t.id + where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp + on p.id=tmp.id; + +compute stats indi_pub_in_subscribed; + +create table indi_result_with_pid as +select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid +from result p + left outer join ( + select p.id, 1 as result_with_pid + from result_pids p) tmp + on p.id= tmp.id; + +compute stats indi_result_with_pid; \ No newline at end of file From eb53b52f7c5e1ca1db48171d76379ab9b364b050 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 2 Aug 2022 13:24:47 +0200 Subject: [PATCH 31/32] code formatting --- 
.../eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index 0f82ecadf5..088a07427d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -587,7 +587,7 @@ object DataciteToOAFTransformation { val oid = result.getId result.setId(IdentifierFactory.createIdentifier(result)) if (!result.getId.equalsIgnoreCase(oid)) { - result.setOriginalId((oid::List(doi)).asJava) + result.setOriginalId((oid :: List(doi)).asJava) } var relations: List[Relation] = From 8b0407d8ecb8cb793247ec0d72348f78a042d929 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Wed, 3 Aug 2022 12:26:59 +0300 Subject: [PATCH 32/32] fixed the datasourceOrganization relations --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 6fa0e6fdfb..01bed17cc1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -93,7 +93,7 @@ where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE 
r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.target like '20|%' and r.datainfo.invisible=false; +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; -- datasource sources: -- where the datasource info have been collected from.