diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 775f228eb..363f95423 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -211,7 +211,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { .orElse(s.getValue()), Function.identity(), (s1, s2) -> Collections - .min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator()))) + .min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator()))) .values()); r.setSubject(subjects); } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index c29614d33..a7ad9e2d6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -49,7 +49,7 @@ object DataciteToOAFTransformation { /** This method should skip record if json contains invalid text * defined in file datacite_filter * - * @param record : unparsed datacite record + * @param record : not parsed Datacite record * @param json : parsed record * @return True if the record should be skipped */ @@ -98,6 +98,10 @@ object DataciteToOAFTransformation { } + /** This utility method indicates whether the embargo date has been reached + * @param embargo_end_date + * @return True if the embargo date has been reached, false otherwise + */ def embargo_end(embargo_end_date: String): Boolean = { val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]")) val td = LocalDate.now() @@ -142,6 +146,21 @@ object DataciteToOAFTransformation 
{ } } + /** * + * Use the vocabulary dnet:publication_resource to find a synonym of one of these terms and get the instance.type. + * Using the dnet:result_typologies vocabulary, we look up the instance.type synonym + * to generate one of the following main entities: + * - publication + * - dataset + * - software + * - otherresearchproduct + * + * @param resourceType + * @param resourceTypeGeneral + * @param schemaOrg + * @param vocabularies + * @return + */ def getTypeQualifier( resourceType: String, resourceTypeGeneral: String, @@ -330,6 +349,7 @@ object DataciteToOAFTransformation { if (result == null) return List() + // DOI is mapped to a PID inside an Instance object val doi_q = OafMapperUtils.qualifier( "doi", "doi", @@ -338,6 +358,8 @@ object DataciteToOAFTransformation { ) val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) result.setPid(List(pid).asJava) + + // This identifier will be replaced later using the PID generation logic result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) result.setOriginalId(List(doi).asJava) @@ -386,6 +408,10 @@ object DataciteToOAFTransformation { a } + if (authors == null || authors.isEmpty || !authors.exists(a => a != null)) + return List() + result.setAuthor(authors.asJava) + val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List()) result.setTitle( @@ -409,10 +435,6 @@ object DataciteToOAFTransformation { .asJava ) - if (authors == null || authors.isEmpty || !authors.exists(a => a != null)) - return List() - result.setAuthor(authors.asJava) - val dates = (json \\ "dates").extract[List[DateType]] val publication_year = (json \\ "publicationYear").extractOrElse[String](null) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index f92040c24..ba7c7dd01 100644 ---
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -554,7 +554,7 @@ public class PublicationToOaf implements Serializable { private KeyValue createCollectedFrom() { KeyValue cf = new KeyValue(); cf.setValue(ModelConstants.ORCID.toUpperCase()); - cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a"); + cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "cd0f74b5955dc87fd0605745c4b49ee8"); return cf; } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java index 9286c7385..d6498e942 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -134,13 +134,19 @@ public class ResultTagger implements Serializable { /* Tagging for Advanced Constraints */ final Set aconstraints = new HashSet<>(); - conf.getSelectionConstraintsMap().keySet() - .forEach(communityId -> { - if(conf.getSelectionConstraintsMap().get(communityId) != null && - conf.getSelectionConstraintsMap().get(communityId) - .getCriteria().stream().anyMatch(crit -> crit.verifyCriteria(param))) - aconstraints.add(communityId); - }); + conf + .getSelectionConstraintsMap() + .keySet() + .forEach(communityId -> { + if (conf.getSelectionConstraintsMap().get(communityId) != null && + conf + .getSelectionConstraintsMap() + .get(communityId) + .getCriteria() + .stream() + .anyMatch(crit -> crit.verifyCriteria(param))) + aconstraints.add(communityId); + }); communities.addAll(aconstraints); @@ -152,7 +158,7 @@ public class ResultTagger implements Serializable { } result.getContext().forEach(c -> { - String cId = c.getId(); + final String cId = 
c.getId(); if (communities.contains(cId)) { Optional> opt_dataInfoList = Optional.ofNullable(c.getDataInfo()); List dataInfoList; @@ -164,21 +170,48 @@ public class ResultTagger implements Serializable { } if (subjects.contains(cId)) dataInfoList - .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false, - OafMapperUtils.qualifier(CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); + .add( + OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT, DNET_PROVENANCE_ACTIONS, + DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST)); if (datasources.contains(cId)) dataInfoList - .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false, - OafMapperUtils.qualifier(CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); + .add( + OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, DNET_PROVENANCE_ACTIONS, + DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST)); if (czenodo.contains(cId)) dataInfoList - .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false, - OafMapperUtils.qualifier(CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); + .add( + OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO, DNET_PROVENANCE_ACTIONS, + DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST)); if (aconstraints.contains(cId)) dataInfoList - .add( - OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false, - OafMapperUtils.qualifier(CLASS_ID_ADVANCED_CONSTRAINT, CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); + .add( + 
OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_ADVANCED_CONSTRAINT, CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT, + DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST)); } }); @@ -199,21 +232,48 @@ public class ResultTagger implements Serializable { List dataInfoList = new ArrayList<>(); if (subjects.contains(c)) dataInfoList - .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false, - OafMapperUtils.qualifier(CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); + .add( + OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT, DNET_PROVENANCE_ACTIONS, + DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST)); if (datasources.contains(c)) dataInfoList - .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false, - OafMapperUtils.qualifier(CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); + .add( + OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, + DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST)); if (czenodo.contains(c)) dataInfoList - .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false, - OafMapperUtils.qualifier(CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); + .add( + OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO, DNET_PROVENANCE_ACTIONS, + DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST)); if (aconstraints.contains(c)) dataInfoList - .add( - OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false, - 
OafMapperUtils.qualifier(CLASS_ID_ADVANCED_CONSTRAINT, CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); + .add( + OafMapperUtils + .dataInfo( + false, BULKTAG_DATA_INFO_TYPE, true, false, + OafMapperUtils + .qualifier( + CLASS_ID_ADVANCED_CONSTRAINT, CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT, + DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), + TAGGING_TRUST)); context.setDataInfo(dataInfoList); return context; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java index a4a6f5663..501eb51b9 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; -@VerbClass("contains_ignorecase") +@VerbClass("contains_caseinsensitive") public class ContainsVerbIgnoreCase implements Selection, Serializable { private String param; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java index c5f0ce070..1cd07755c 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; -@VerbClass("equals_ignorecase") +@VerbClass("equals_caseinsensitive") public class EqualVerbIgnoreCase implements Selection, Serializable { private String param; diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java index b21be83f0..e12b65a27 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; -@VerbClass("not_contains_ignorecase") +@VerbClass("not_contains_caseinsensitive") public class NotContainsVerbIgnoreCase implements Selection, Serializable { private String param; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java index c6958a641..c1749621e 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; -@VerbClass("not_equals_ignorecase") +@VerbClass("not_equals_caseinsensitive") public class NotEqualVerbIgnoreCase implements Selection, Serializable { private String param; diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml index 06c57511d..4e580edf5 100644 --- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml @@ -1193,7 +1193,7 
@@ - {"criteria":[{"constraint":[{"verb":"equals_ignorecase","field":"subject","value":"ciencias de la comunicación"}, + {"criteria":[{"constraint":[{"verb":"equals_caseinsensitive","field":"subject","value":"ciencias de la comunicación"}, {"verb":"equals","field":"subject","value":"Miriam"}]}, {"constraint":[{"verb":"equals","field":"subject","value":"miriam"}]}]} @@ -1317,81 +1317,81 @@ opendoar____::358aee4cc897452c00244351e4d91f69 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}}]} re3data_____::7b0ad08687b2c960d5aeef06f811d5e6 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} driver______::bee53aa31dc2cbb538c10c2b65fa5824 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + 
{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} openaire____::437f4b072b1aa198adcbc35910ff3b98 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} openaire____::081b82f96300b6a6e3d282bad31cb6e2 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} openaire____::9e3be59865b2c1c335d32dae2fe7b254 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + 
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} opendoar____::8b6dd7db9af49e67306feb59a8bdc52c - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} share_______::4719356ec8d7d55d3feb384ce879ad6c - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} share_______::bbd802baad85d1fd440f32a7a3a2c2b1 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + 
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} opendoar____::6f4922f45568161a8cdf4ad2299f6d23 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} re3data_____::7980778c78fb4cf0fab13ce2159030dc - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCov"}]}]} re3data_____::978378def740bbf2bfb420de868c460b - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCov"}]}]} diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/log4j.properties 
b/dhp-workflows/dhp-enrichment/src/test/resources/log4j.properties new file mode 100644 index 000000000..ce37270c6 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/log4j.properties @@ -0,0 +1,25 @@ +# Root logger option +log4j.rootLogger=DEBUG, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n + +# Change this to set Spark log level +log4j.logger.org.apache.spark=ERROR +log4j.rootCategory=WARN + +# Silence akka remoting +log4j.logger.Remoting=WARN + +# Ignore messages below warning level from Jetty, because it's a bit verbose +log4j.logger.org.eclipse.jetty=WARN + +log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=WARN +log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN +log4j.logger.org.apache.parquet.hadoop.ParquetOutputFormat=WARN +log4j.logger.org.apache.parquet.hadoop.InternalParquetRecordWriter=WARN +log4j.logger.org.apache.hadoop.io.compress.CodecPool=WARN +log4j.logger.org.apache.parquet.hadoop.codec.CodecConfig=WARN \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 147e26699..5f3b4e1ca 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; +import java.util.Objects; import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.lang3.SerializationUtils; @@ -10,6 +11,7 @@ import 
org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; @@ -31,29 +33,30 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer modified = new AtomicReference<>(false); + vocabularies.find(vocabularyId).ifPresent(vocabulary -> { - if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) { - return; - } - Qualifier newValue = vocabulary.lookup(subject.getValue()); - if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) { - subject.setValue(newValue.getClassid()); - subject.getQualifier().setClassid(vocabularyId); - subject.getQualifier().setClassname(vocabulary.getName()); - modified.set(true); + if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) { + Qualifier newValue = vocabulary.lookup(subject.getValue()); + if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) { + subject.setValue(newValue.getClassid()); + subject.getQualifier().setClassid(vocabularyId); + subject.getQualifier().setClassname(vocabulary.getName()); + } + } else if (vocabularyId.equals(subject.getQualifier().getClassid())) { + Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue()); + VocabularyTerm term = vocabulary.getTerm(subject.getValue()); + if (Objects.isNull(syn) && Objects.isNull(term)) { + subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD); + subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD); + } } }); - return modified.get(); } private static void cleanRelation(VocabularyGroup vocabularies, Relation r) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java index c150c63df..45590f789 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java @@ -43,7 +43,7 @@ public class CleanCountrySparkJob implements Serializable { String jsonConfiguration = IOUtils .toString( - CleanContextSparkJob.class + CleanCountrySparkJob.class .getResourceAsStream( "/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -117,7 +117,7 @@ public class CleanCountrySparkJob implements Serializable { p -> p .getQualifier() .getClassid() - .equals(PidType.doi) && pidInParam(p.getValue(), verifyParam))) { + .equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) { r .setCountry( r diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java index dd5af6998..d3741d3e8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java @@ -65,7 +65,6 @@ public class GetDatasourceFromCountry implements Serializable { conf, isSparkSessionManaged, spark -> { - getDatasourceFromCountry(spark, country, inputPath, workingPath); }); } @@ -83,7 +82,6 @@ public class GetDatasourceFromCountry implements Serializable { (FilterFunction) o -> !o.getDataInfo().getDeletedbyinference() && o.getCountry().getClassid().length() > 0 && o.getCountry().getClassid().equals(country)); - ; // filtering of the relations 
taking the non deleted by inference and those with IsProvidedBy as relclass Dataset relation = spark @@ -97,7 +95,7 @@ public class GetDatasourceFromCountry implements Serializable { !rel.getDataInfo().getDeletedbyinference()); organization - .joinWith(relation, organization.col("id").equalTo(relation.col("target")), "left") + .joinWith(relation, organization.col("id").equalTo(relation.col("target"))) .map((MapFunction, String>) t2 -> t2._2().getSource(), Encoders.STRING()) .write() .mode(SaveMode.Overwrite) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index c157be51a..7aa40cb8a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -366,6 +366,7 @@ public abstract class AbstractMdRecordToOafMapper { r.setInstance(instances); r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances)); + r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info)); } protected abstract List prepareResultPids(Document doc, DataInfo info); @@ -384,6 +385,25 @@ public abstract class AbstractMdRecordToOafMapper { return list; } + private List prepareEOSCIfGuidelines(Document doc, DataInfo info) { + final Set set = Sets.newHashSet(); + for (final Object o : doc.selectNodes("//oaf:eoscifguidelines")) { + final String code = ((Node) o).valueOf("@code"); + final String label = ((Node) o).valueOf("@label"); + final String url = ((Node) o).valueOf("@url"); + final String semrel = ((Node) o).valueOf("@semanticrelation"); + if (StringUtils.isNotBlank(code)) { + final EoscIfGuidelines eig = new EoscIfGuidelines(); + eig.setCode(code); + eig.setLabel(label); + eig.setUrl(url); + eig.setSemanticRelation(semrel); + set.add(eig); + } + 
} + return Lists.newArrayList(set); + } + protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); protected abstract List prepareInstances( diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index a25bcd47e..39c77bd37 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -177,6 +177,9 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) { url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } + for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='w3id']")) { + url.add(trimAndDecodeUrl(((Node) o).getText().trim())); + } Set validUrl = validateUrl(url); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 2ba0a7ad7..6435d5131 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -432,14 +432,14 @@ - + - + yarn cluster - Clean publications context + Select datasource ID from country eu.dnetlib.dhp.oa.graph.clean.country.GetDatasourceFromCountry dhp-graph-mapper-${projectVersion}.jar @@ -471,7 +471,7 @@ yarn cluster - Clean publications counmtry + Clean publication country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -489,10 +489,10 @@ --workingPath${workingDir}/working/publication 
--country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} - + @@ -500,7 +500,7 @@ yarn cluster - Clean datasets Country + Clean dataset country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -518,10 +518,10 @@ --workingPath${workingDir}/working/dataset --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} - + @@ -529,7 +529,7 @@ yarn cluster - Clean otherresearchproducts country + Clean otherresearchproduct country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -547,10 +547,10 @@ --workingPath${workingDir}/working/otherresearchproduct --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} - + @@ -558,7 +558,7 @@ yarn cluster - Clean softwares country + Clean software country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -576,7 +576,7 @@ --workingPath${workingDir}/working/software --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index ba5f4f375..4468382be 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -126,6 +126,7 @@ 
--hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Publication --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions8000 @@ -152,6 +153,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Dataset --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions4000 @@ -178,6 +180,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions3000 @@ -204,6 +207,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Software --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions300 @@ -230,6 +234,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Datasource --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions100 @@ -256,6 +261,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Organization --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions400 @@ -309,6 +315,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Relation --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions10000 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index 556106180..362cb2028 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -116,54 +116,45 @@ object SparkConvertRDDtoDataset { .map(s => mapper.readValue(s, classOf[Relation])) .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference) .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) - .filter(r => filterRelations(subRelTypeFilter, relClassFilter, r)) - //filter OpenCitations relations - .filter(r => - r.getDataInfo.getProvenanceaction != null && - 
!"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid) - ) + .filter(r => filterRelations(r)) + //filter OpenCitations relations +// .filter(r => +// r.getDataInfo.getProvenanceaction != null && +// !"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid) +// ) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") } - private def filterRelations(subRelTypeFilter: String, relClassFilter: List[String], r: Relation): Boolean = { - if (StringUtils.isNotBlank(subRelTypeFilter)) { - subRelTypeFilter.equalsIgnoreCase(r.getSubRelType) - } else { - !relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)) + private def filterRelations(r: Relation): Boolean = { + + /** * + * We filter relation generated by dedups + * and all the relation that have one single collectedFrom OpenCitation + */ + + val relClassFilter = List( + ModelConstants.MERGES, + ModelConstants.IS_MERGED_IN, + ModelConstants.HAS_AMONG_TOP_N_SIMILAR_DOCS, + ModelConstants.IS_AMONG_TOP_N_SIMILAR_DOCS + ) + if (relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) + false + else { + if (r.getCollectedfrom == null || r.getCollectedfrom.size() == 0) + false + else if (r.getCollectedfrom.size() > 1) + true + else if ( + r.getCollectedfrom.size() == 1 && r.getCollectedfrom.get(0) != null && "OpenCitations".equalsIgnoreCase( + r.getCollectedfrom.get(0).getValue + ) + ) + false + else + true } } - /* - //TODO: finalise implementation - private def processResult[T<: Result]( - implicit ct: ClassTag[T], - log: Logger, - spark: SparkSession, - sourcePath: String, - entityPath: String, - clazz: Class[T] - ): Unit = { - val entityType = clazz.getSimpleName.toLowerCase - - log.info(s"Converting $entityType") - - val mapper = new ObjectMapper() with ScalaObjectMapper - mapper.registerModule(DefaultScalaModule) - - val rdd = spark.sparkContext - .textFile(s"$sourcePath/$entityType") - .map(s => 
mapper.readValue(s, clazz)) - .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference); - - implicit val encoder: Encoder[T] = Encoders.kryo(clazz) - spark - .createDataset(rdd) - .as[T] - .write - .mode(SaveMode.Overwrite) - .save(s"$entityPath/$entityType") - } - */ - } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 6c43da832..4035307e5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -278,6 +278,16 @@ public class GraphCleaningFunctionsTest { s -> "0102 computer and information sciences".equals(s.getValue()) & ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); + List s1 = p_cleaned + .getSubject() + .stream() + .filter(s -> s.getValue().equals("In Situ Hybridization")) + .collect(Collectors.toList()); + assertNotNull(s1); + assertEquals(1, s1.size()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get(0).getQualifier().getClassid()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get(0).getQualifier().getClassname()); + // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_cleaned)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 7552d1789..3e35021c8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -936,11 +936,23 @@ class MappersTest { System.out.println("***************"); 
System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println("***************"); -// final OtherResearchProduct p = (OtherResearchProduct) list.get(0); -// assertValidId(p.getId()); -// assertValidId(p.getCollectedfrom().get(0).getKey()); -// System.out.println(p.getTitle().get(0).getValue()); -// assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertEquals(5, list.size()); + final OtherResearchProduct p = (OtherResearchProduct) list.get(0); + assertValidId(p.getId()); + assertTrue(p.getId().startsWith("50|w3id")); + assertValidId(p.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertEquals(1, p.getInstance().size()); + assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", p.getPid().get(0).getValue()); + Instance inst = p.getInstance().get(0); + assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getPid().get(0).getValue()); + assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getUrl().get(0)); + assertEquals(1, p.getEoscifguidelines().size()); + assertEquals("EOSC::RO-crate", p.getEoscifguidelines().get(0).getCode()); + assertEquals("EOSC::RO-crate", p.getEoscifguidelines().get(0).getLabel()); + assertEquals("", p.getEoscifguidelines().get(0).getUrl()); + assertEquals("compliesWith", p.getEoscifguidelines().get(0).getSemanticRelation()); + } @Test @@ -988,6 +1000,17 @@ class MappersTest { } + @Test + void testEOSCFuture_ROHub() throws IOException { + final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("photic-zone-transformed.xml"))); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + final OtherResearchProduct rocrate = (OtherResearchProduct) list.get(0); + assertNotNull(rocrate.getEoscifguidelines()); + System.out.println("***************"); + System.out.println(new 
ObjectMapper().writeValueAsString(rocrate)); + System.out.println("***************"); + } + @Test void testNotWellFormed() throws IOException { final String xml = IOUtils diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 8e4fc4545..84ff35c08 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -706,6 +706,28 @@ "source": [ ], "subject": [ + { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:repository", + "classname": "sysimport:crosswalk:repository", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": false, + "trust": "0.9" + }, + "qualifier": { + "classid": "FOS", + "classname": "Fields of Science and Technology classification", + "schemeid": "dnet:result_subject", + "schemename": "dnet:result_subject" + }, + "value": "In Situ Hybridization" + }, { "dataInfo": { "deletedbyinference": false, diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/photic-zone-transformed.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/photic-zone-transformed.xml new file mode 100644 index 000000000..22bf0577e --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/photic-zone-transformed.xml @@ -0,0 +1,108 @@ + + +
+ fsh_____4119::68126da991bd76d8be494bddfbf7a1bb + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + + + + + + 2022-11-15T12:29:19Z + 2022-11-15T12:29:19Z + fsh_____4119 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + 2022-11-15T12:29:19Z + rohub_data + ro-crate_data +
+ + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/b1b617b2-6b79-4bae-9fa6-b76945645626 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/78103994-30be-4875-bf89-5acd752b5c3d + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/18fd1c70-249b-4c67-80ee-539f801a0da7 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/32faa2eb-4cc8-401f-ac5c-bec2849b70e1 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/4c253f5a-d427-40c2-9e9f-6063ae087239 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/371b1957-078c-472b-a195-af7bce152c10 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/82f9e4b8-01b4-4e50-9e27-ec9d337c8d74 + + RO-crate + + Creative Commons Attribution 4.0 International + open access + + + Mapping the photic zone of the Mediterranean Sea + + + Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea + + CNR-ISMAR + + + Giorgio Castellan + + + Lorenzo Angeletti + + + Paolo Montagna + + + Marco Taviani + + + + 2022-11-14T16:32:45Z + + + Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea + + 2022 + + open access + + + 813.478 KB + + + Earth sciences + Ecology + Optics + + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + 0048 + 2022-11-14 + OPEN + https://creativecommons.org/licenses/by/4.0/legalcode + + + + + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml index c85b55786..18f637ecc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml @@ -21,15 +21,13 @@ - https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca - + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca + + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca + - - https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/24fae96f-f986-46e1-bfd0-a21ca20ff0ce - - - https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/6d3427a8-352e-49f4-9796-f618c44dc16d - + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/24fae96f-f986-46e1-bfd0-a21ca20ff0ce + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/6d3427a8-352e-49f4-9796-f618c44dc16d RO-crate @@ -43,21 +41,17 @@ Poznań Supercomputing and Networking Center - - - Generation Service - + + Generation Service - - CNR-ISMAR - + CNR-ISMAR - 2018-06-20T11:21:46Z + 2018-06-20T11:21:46Z The use of biological effects tools offer enormous potential to meet the challenges outlined by the European Union Marine Strategy Framework Directive (MSFD) whereby Member States are required to develop a robust set of tools for defining 11 qualitative descriptors of Good Environmental Status (GES), such as demonstrating that "Concentrations of contaminants are at levels not giving rise to pollution effects" (GES Descriptor 8). This paper discusses the combined approach of monitoring chemical contaminant levels, along side biological effect measurements relating to the effect of pollutants, for undertaking assessments of GES across European marine regions. 
We outline the minimum standards that biological effects tools should meet if they are to be used for defining GES in relation to Descriptor 8 and describe the current international initiatives underway to develop assessment criteria for these biological effects techniques. Crown Copyright (C) 2010 Published by Elsevier Ltd. All rights reserved. @@ -71,15 +65,18 @@ Ecology - EOSC::RO-crate - https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca other research product - + 2018-06-20 OPEN + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java new file mode 100644 index 000000000..08bf19fe4 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java @@ -0,0 +1,88 @@ +package eu.dnetlib.dhp.oa.provision; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; +import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; +import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; +import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; +import org.apache.commons.io.IOUtils; +import org.apache.solr.client.solrj.util.ClientUtils; +import org.apache.solr.common.SolrInputDocument; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.io.SAXReader; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import java.io.IOException; +import 
java.io.StringReader; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class EOSCFuture_Test { + + public static ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + + public static final String VERSION = "2021-04-15T10:05:53Z"; + public static final String DSID = "b9ee796a-c49f-4473-a708-e7d67b84c16d_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl"; + + private ContextMapper contextMapper; + + @BeforeEach + public void setUp() { + contextMapper = new ContextMapper(); + } + + + @Test + public void testEOSC_ROHub() throws IOException, DocumentException, TransformerException { + + final ContextMapper contextMapper = new ContextMapper(); + + final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, + XmlConverterJob.schemaLocation); + + final OtherResearchProduct p = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResourceAsStream("eosc-future/photic-zone.json")), OtherResearchProduct.class); + + final String xml = xmlRecordFactory.build(new JoinedEntity<>(p)); + + assertNotNull(xml); + + final Document doc = new SAXReader().read(new StringReader(xml)); + + assertNotNull(doc); + System.out.println(doc.asXML()); + + + testRecordTransformation(xml); + } + + + private void testRecordTransformation(final String record) throws IOException, TransformerException { + final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml")); + final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")); + + final String transformer = XmlIndexingJob.getLayoutTransformer("DMF", fields, xslt); + + final Transformer tr = SaxonTransformerFactory.newInstance(transformer); + + final String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record); + + final SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID) + .parseDocument(indexRecordXML); + + final String xmlDoc = 
ClientUtils.toXML(solrDoc); + + Assertions.assertNotNull(xmlDoc); + System.out.println(xmlDoc); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index e0fbb2a2f..17c3cdb30 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -128,6 +128,20 @@ public class IndexRecordTransformerTest { testRecordTransformation(record); } + @Test + public void testForEOSCFutureSoftwareNotebook() throws IOException, TransformerException { + final String record = IOUtils + .toString(getClass().getResourceAsStream("eosc-future/software-justthink.xml")); + testRecordTransformation(record); + } + + @Test + public void testForEOSCFutureSoftwareNotebookClaim() throws IOException, TransformerException { + final String record = IOUtils + .toString(getClass().getResourceAsStream("eosc-future/software-justthink-claim.xml")); + testRecordTransformation(record); + } + @Test void testDoiUrlNormalization() throws MalformedURLException { diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json new file mode 100644 index 000000000..9729c6051 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json @@ -0,0 +1 @@ 
+{"collectedfrom":[{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1669134693781,"id":"50|w3id________::68126da991bd76d8be494bddfbf7a1bb","originalId":["50|fsh_____4119::68126da991bd76d8be494bddfbf7a1bb","https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be"],"pid":[{"value":"https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be","qualifier":{"classid":"w3id","classname":"w3id.org","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofcollection":"2022-11-15T12:29:19Z","dateoftransformation":"2022-11-15T12:29:19Z","extraInfo":[],"oaiprovenance":null,"processingchargeamount":null,"processingchargecurrency":null,"measures":null,"author":[{"fullname":"Giorgio Castellan","name":"","surname":"","rank":1,"pid":[],"affiliation":[]},{"fullname":"Lorenzo Angeletti","name":"","surname":"","rank":2,"pid":[],"affiliation":[]},{"fullname":"Paolo Montagna","name":"","surname":"","rank":3,"pid":[],"affiliation":[]},{"fullname":"Marco Taviani","name":"","surname":"","rank":4,"pid":[],"affiliation":[]}],"resulttype":{"classid":"other","classname":"other","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[{"value":"Earth 
sciences","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"Ecology","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"Optics","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[{"value":"Mapping the photic zone of the Mediterranean Sea","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[{"value":"2022-11-14T16:32:45Z","qualifier":{"classid":"Issued","classname":"Issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"description":[{"value":"Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofacceptance":{"value":"2022-11-14","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":{"value":"CNR-ISMAR","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"embargoenddate":null,"sou
rce":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"RO-crate","classname":"RO-crate","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"coverage":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":[],"instance":[{"license":{"value":"https://creativecommons.org/licenses/by/4.0/legalcode","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes","openAccessRoute":null},"instancetype":{"classid":"0048","classname":"Research Object","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null},"url":["https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be"],"distributionlocation":null,"collectedfrom":{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null},"pid":[{"value":"https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be","qualifier":{"classid":"w3id","classname":"w3id.org","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"alternateIdentifier":[],"dateofacceptance":{"value":"2022-11-14","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","infe
renceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"measures":null}],"eoscifguidelines":[{"code":"EOSC::Jupyter Notebook","label":"EOSC::Jupyter Notebook","url":"","semanticRelation":"compliesWith"},{"code":"EOSC::Data Cube","label":"EOSC::Data Cube","url":"","semanticRelation":"compliesWith"},{"code":"EOSC::RO-crate","label":"EOSC::RO-crate","url":"","semanticRelation":"compliesWith"}],"contactperson":[],"contactgroup":[],"tool":[]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink-claim.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink-claim.xml new file mode 100644 index 000000000..02089bb30 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink-claim.xml @@ -0,0 +1,305 @@ + + +
+ od______2659::3801993ea8f970cfc991277160edf277 + 2022-08-08T03:06:13Z + under curation + +
+ + + + JUSThink + Alignment Analysis + Norman, Utku + Dinkar, Tanvi + Bruno, Barbara + Clavel, Chloé + + + + +

+ 1. Description +

+

This repository contains tools to automatically analyse how + participants align their use of task-specific referents in their + dialogue and actions for a collaborative learning activity, and how + it relates to the task success (i.e. their learning + outcomes and task performance).

+

As a use case, it processes data from a collaborative problem solving + activity named JUSThink [1, 2], i.e. + JUSThink Dialogue and Actions Corpus data set that is available from the + Zenodo Repository, DOI: 10.5281/zenodo.4627104, and reproduces the results and figures + in [3].

+

In brief:

+
    +
  1. JUSThink Dialogue and Actions Corpus contains + transcripts, event logs, and test responses of children aged 9 + through 12, as they participate in the JUSThink activity [1, 2] + in pairs of two, to solve a problem on graphs together.
  2. +
  3. The JUSThink activity and its study is first + described in [1], and elaborated with findings concerning the link + between children's learning, performance in the activity, and + perception of self, the other and the robot in [2].
  4. +
  5. Alignment analysis in our work [3] studies the participants' use of + expressions that are related to the task at hand, their follow up + actions of these expressions, and how it links to task success.
  6. +
+

+ 2. Publications +

+

If you use this work in an academic context, please cite the following + publications:

+
    +
  • +

    Norman*, U., Dinkar*, T., Bruno, B., & Clavel, C. (2022). + Studying Alignment in a Collaborative Learning Activity via + Automatic Methods: The Link Between What We Say and Do. Dialogue + & Discourse, 13(2), 1 - ;48. *Contributed equally to this + work. https://doi.org/10.5210/dad.2022.201

    +
  • +
  • +

    Norman, U., Dinkar, T., Bruno, B., & Clavel, C. (2021). + JUSThink Alignment Analysis. In Dialogue & Discourse + (v1.0.0, Vol. 13, Number 2, pp. 1 - ;48). Zenodo. https://doi.org/10.5281/zenodo.4675070

    +
  • +
+

+ 3. Content +

+

The tools provided in this repository consists of 7 Jupyter Notebooks + written in Python 3, and two additional external tools utilised by the + notebooks.

+

+ 3.1. Jupyter Notebooks +

+

We highlight that the notebooks up until the last (i.e. to test the + hypotheses (tools/7_test_the_hypotheses.ipynb)) present a general + pipeline to process event logs, test responses and transcripts to + extract measures of task performance, learning outcomes, and measures of + alignment.

+
    +
  1. Extract task performance (and other features) from the logs + (tools/1_extract_performance_and_other_features_from_logs.ipynb): + Extracts various measures of task behaviour from the logs, at + varying granularities of the activity (i.e. the whole corpus, task, + attempt, and turn levels). In later notebooks, we focus on one of + the features to estimate the task performance of a team: (minimum) + error.
  2. +
  3. Extract learning outcomes from the test responses + (tools/2_extract_learning_gain_from_test_responses.ipynb): Extracts + measures of learning outcomes from the responses to the pre-test and + the post-test. In later notebooks, we focus on one of the features + to estimate the learning outcome of a team: relative learning gain + [4]
  4. +
  5. Select and visualise a subset of teams for + transcription + (tools/3_visualise_transcribed_teams.ipynb): Visualises the + transcribed teams among the other teams in the feature space spanned + by task performance and learning outcome, as well as the + distribution of their number of attempts and turns.
  6. +
  7. Extract routines from transcripts + (tools/4_extract_routines_from_transcripts.ipynb) (uses dialign to + extract routines): Extracts routines of referring expressions that + are "fixed", i.e. become shared or established amongst + interlocutors.
  8. +
  9. Combine transcripts with logs + (tools/5_construct_the_corpus_by_combining_transcripts_with_logs.ipynb): + Merges transcripts with event logs to have a combined dialogue and + actions corpus, to be processed e.g. to detect follow-up + actions.
  10. +
  11. Recognise instructions and detect follow-up actions + (tools/6_recognise_instructions_detect_follow-up_actions.ipynb): + Extracts verbalised instruction such as "connect Mount Basel to + Montreux", and pairs them with the follow-up action that may + match (e.g. if the other connects Basel to Montreux) or + mismatch (e.g. if the other connects Basel to + Neuchatel) with the instruction.
  12. +
  13. Test the hypotheses in [3] (tools/7_test_the_hypotheses.ipynb) (uses + effsize to estimate effect size, specifically + Cliff's Delta): Considers each research questions and hypotheses + studied in [3] and generates the results in [3].
  14. +
+

+ 3.2. External Tools +

+
    +
  1. dialign + tool to extract routines, specifically Release 1.0 from dialign-1.0.zip:\n It extracts routine expressions that are + "shared" among the participants from transcripts. \n It is + used as an external module (in accordance with its CeCILL-B License, + see License).
  2. +
  3. effsize tool to compute estimators of effect + size.\n We specifically use it to compute Cliff's Delta, which + quantifies the amount difference between two groups of observations, + by computing the Cliff's Delta statistic.\n It is taken from + project DABEST (see License).
  4. +
+

+ 4. Research Questions and Hypotheses in [3] +

+
    +
  • RQ1 Lexical alignment: How do the interlocutors + use expressions related to the task? Is this associated + with task success?
      +
    • H1.1: Task-specific referents become + routine early for more successful teams.
    • +
    • H1.2: Hesitation phenomena are more likely + to occur in the vicinity of priming and establishment of + task-specific referents for more successful teams.
    • +
    +
  • +
  • RQ2 Behavioural alignment: How do the interlocutors + follow up these expressions with actions? Is this + associated with task success?
      +
    • H2.1: Instructions are more likely to be + followed by a corresponding action early in the dialogue for + more successful teams.
    • +
    • H2.2: When instructions are followed by a + corresponding or a different action, the action is more + likely to be in the vicinity of information management + phenomena for more successful teams.
    • +
    +
  • +
+

The RQs and Hs are addressed in the notebook for testing the hypotheses + (i.e. tools/7_test_the_hypotheses.ipynb).

+

+ Acknowledgements +

+

This project has received funding from the European Union's Horizon + 2020 research and innovation programme under grant agreement No 765955. + Namely, the ANIMATAS Project.

+

+ License +

+

The whole package is under MIT License, see the LICENSE + file.

+

Classes under the tools/effsize package were taken from + project DABEST, Copyright 2016-2020 Joses W. Ho. + These classes are licensed under the BSD 3-Clause Clear License. See + tools/effsize/LICENSE file for additional + details.

+

Classes under the tools/dialign-1.0 package were taken + from project dialign. These classes are licensed under the + CeCILL-B License. This package is used as an "external + module", see tools/dialign-1.0/LICENSE.txt for + additional details.

+
+ + + + Zenodo + + + + + + + + + + + + + + + + + + + oai:zenodo.org:4675070 + + oai:zenodo.org:4675070 + 10.5281/zenodo.4675070 + + + + false + false + 0.9 + + + + + + corda__h2020::c4515ebef538a734cf11f795347f5dac + 765955 + ANIMATAS + Advancing intuitive human-machine interaction with human-like + social capabilities for education in schools + + + + ec__________::EC::H2020 + + + + + + + + + + + + + https://zenodo.org/record/4675070 + + + +
+
+
+
+
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink.xml new file mode 100644 index 000000000..9c0f4ea7d --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink.xml @@ -0,0 +1,429 @@ + + +
+ doi_dedup___::c054151b6a8c4f41c7acf160651a6503 + 2022-10-13T00:15:44+0000 + 2022-10-13T07:44:29.152Z +
+ + + + + + oai:zenodo.org:4675070 + 50|od______2659::3801993ea8f970cfc991277160edf277 + oai:zenodo.org:6974562 + 50|od______2659::9c87ff4a5e7710052b873088e7265072 + 10.5281/zenodo.4675069 + 10.5281/zenodo.4675070 + 10.5281/zenodo.6974562 + 10.5281/zenodo.4675069 + + + + + + JUSThink Alignment + Analysis + + Norman, Utku + Dinkar, Tanvi + Bruno, Barbara + Clavel, Chloé + 2022-08-08 + &lt;strong>1. Description&lt;/strong> This repository + contains&lt;strong> tools to automatically analyse how participants align + their use of task-specific referents in their dialogue and actions for a + collaborative learning activity, and how it relates to the task + success&lt;/strong> (i.e. their learning outcomes and task performance). As + a use case, it processes data from a collaborative problem solving activity + named JUSThink [1, 2], i.e. JUSThink Dialogue and Actions Corpus data set that + is available from the Zenodo Repository, DOI: 10.5281/zenodo.4627104, and + reproduces the results and figures in [3]. In brief: &lt;strong>JUSThink + Dialogue and Actions Corpus&lt;/strong> contains transcripts, event logs, + and test responses of children aged 9 through 12, as they participate in the + JUSThink activity [1, 2] in pairs of two, to solve a problem on graphs together. + &lt;strong>The JUSThink activity and its study&lt;/strong> is first + described in [1], and elaborated with findings concerning the link between + children's learning, performance in the activity, and perception of self, the + other and the robot in [2]. &lt;strong>Alignment analysis in our work + [3]&lt;/strong> studies the participants' use of expressions that are + related to the task at hand, their follow up actions of these expressions, and + how it links to task success. &lt;strong>Changes in Release + v1.1.0:&lt;/strong> updated with the publication information, finalized + paper structure, research questions and hypotheses as in the published article: + U. Norman*&lt;em>, &lt;/em>T. Dinkar*, B. 
Bruno, and C. Clavel, + "Studying Alignment in a Collaborative Learning Activity via Automatic Methods: + The Link Between What We Say and Do," Dialogue &amp;amp; Discourse, 13(2), + 1–48. *Contributed equally to this work. 10.5210/dad.2022.201. + &lt;strong>Full Changelog:&lt;/strong> + https://github.com/chili-epfl/justhink-alignment-analysis/compare/v1.0.0...v1.1.0 + &lt;strong>2. Publications&lt;/strong> If you use this work in an + academic context, please cite the following publications: Norman*, U., Dinkar*, + T., Bruno, B., &amp;amp; Clavel, C. (2022). Studying Alignment in a + Collaborative Learning Activity via Automatic Methods: The Link Between What We + Say and Do. Dialogue &amp;amp; Discourse, 13(2), 1–48. *Contributed equally + to this work. https://doi.org/10.5210/dad.2022.201 Norman, U., Dinkar, T., + Bruno, B., &amp;amp; Clavel, C. (2021). JUSThink Alignment Analysis. In + Dialogue &amp;amp; Discourse (v1.1.0, Vol. 13, Number 2, pp. 1–48). Zenodo. + https://doi.org/10.5281/zenodo.6974562 &lt;strong>3. Content&lt;/strong> + The tools provided in this repository consists of 7 Jupyter Notebooks written in + Python 3, and two additional external tools utilised by the notebooks. + &lt;strong>3.1. Jupyter Notebooks&lt;/strong> We highlight that the + notebooks up until the last (i.e. to test the hypotheses + (tools/7_test_the_hypotheses.ipynb)) present a general pipeline to process event + logs, test responses and transcripts to extract measures of task performance, + learning outcomes, and measures of alignment. &lt;strong>Extract task + performance (and other features) from the logs + &lt;/strong>(tools/1_extract_performance_and_other_features_from_logs.ipynb): + Extracts various measures of task behaviour from the logs, at varying + granularities of the activity (i.e. the whole corpus, task, attempt, and turn + levels). In later notebooks, we focus on one of the features to estimate the + task performance of a team: (minimum) error. 
&lt;strong>Extract learning + outcomes from the test responses&lt;/strong> + (tools/2_extract_learning_gain_from_test_responses.ipynb): Extracts measures of + learning outcomes from the responses to the pre-test and the post-test. In later + notebooks, we focus on one of the features to estimate the learning outcome of a + team: relative learning gain [4] &lt;strong>Select and visualise a subset of + teams for transcription&lt;/strong> + (tools/3_visualise_transcribed_teams.ipynb): Visualises the transcribed teams + among the other teams in the feature space spanned by task performance and + learning outcome, as well as the distribution of their number of attempts and + turns. &lt;strong>Extract routines from transcripts&lt;/strong> + (tools/4_extract_routines_from_transcripts.ipynb) (uses dialign to extract + routines): Extracts routines of referring expressions that are "fixed", i.e. + become shared or established amongst interlocutors. &lt;strong>Combine + transcripts with logs&lt;/strong> + (tools/5_construct_the_corpus_by_combining_transcripts_with_logs.ipynb): Merges + transcripts with event logs to have a combined dialogue and actions corpus, to + be processed e.g. to detect follow-up actions. &lt;strong>Recognise + instructions and detect follow-up actions&lt;/strong> + (tools/6_recognise_instructions_detect_follow-up_actions.ipynb): Extracts + verbalised instruction such as "connect Mount Basel to Montreux", and pairs them + with the follow-up action that may &lt;em>match&lt;/em> (e.g. if the + other connects Basel to Montreux) or &lt;em>mismatch&lt;/em> (e.g. if + the other connects Basel to Neuchatel) with the instruction. &lt;strong>Test + the hypotheses &lt;/strong>in [3] (tools/7_test_the_hypotheses.ipynb) (uses + &lt;strong>effsize&lt;/strong> to estimate effect size, specifically + Cliff's Delta): Considers each research questions and hypotheses studied in [3] + and generates the results in [3]. &lt;strong>3.2. 
External + Tools&lt;/strong> &lt;strong>dialign tool&lt;/strong> to extract + routines, specifically Release 1.0 from dialign-1.0.zip:&lt;br> It extracts + routine expressions that are "shared" among the participants from transcripts. + &lt;br> It is used as an external module (in accordance with its CeCILL-B + License, see &lt;strong>License&lt;/strong>). &lt;strong>effsize + tool&lt;/strong> to compute estimators of effect size.&lt;br> We + specifically use it to compute Cliff's Delta, which quantifies the amount + difference between two groups of observations, by computing the Cliff's Delta + statistic.&lt;br> It is taken from project DABEST (see + &lt;strong>License&lt;/strong>). &lt;strong>4. Research Questions + and Hypotheses in [3]&lt;/strong> &lt;strong>RQ1 Lexical + alignment&lt;/strong>: How do the interlocutors &lt;em>use&lt;/em> + expressions related to the task? Is this associated with task success? + &lt;strong>H1.1&lt;/strong>: Task-specific referents become routine + early for more successful teams. &lt;strong>H1.2&lt;/strong>: Hesitation + phenomena are more likely to occur in the vicinity of priming and establishment + of task-specific referents for more successful teams. &lt;strong>RQ2 + Behavioural alignment&lt;/strong>: How do the interlocutors + &lt;em>follow up&lt;/em> these expressions with actions? Is this + associated with task success? &lt;strong>H2.1&lt;/strong>: Instructions + are more likely to be followed by a corresponding action early in the dialogue + for more successful teams. &lt;strong>H2.2&lt;/strong>: When + instructions are followed by a corresponding or a different action, the action + is more likely to be in the vicinity of information management phenomena for + more successful teams. The RQs and Hs are addressed in the notebook for testing + the hypotheses (i.e. tools/7_test_the_hypotheses.ipynb). 
+ &lt;strong>Acknowledgements&lt;/strong> This project has received + funding from the European Union's Horizon 2020 research and innovation programme + under grant agreement No 765955. Namely, the ANIMATAS Project. + &lt;strong>License&lt;/strong> The whole package is under MIT License, + see the &lt;strong>LICENSE&lt;/strong> file. Classes under the + &lt;strong>tools/effsize&lt;/strong> package were taken from project + &lt;strong>DABEST&lt;/strong>, Copyright 2016-2020 Joses W. Ho. These + classes are licensed under the BSD 3-Clause Clear License. See + &lt;strong>tools/effsize/LICENSE&lt;/strong> file for additional + details. Classes under the &lt;strong>tools/dialign-1.0&lt;/strong> + package were taken from project &lt;strong>dialign&lt;/strong>. These + classes are licensed under the CeCILL-B License. This package is used as an + "external module", see&lt;strong> + tools/dialign-1.0/LICENSE.txt&lt;/strong> for additional + details. + {"references": ["[1] J. Nasir, U. Norman, B. Bruno, and P. Dillenbourg, + \"You Tell, I Do, and We Swap until we Connect All the Gold Mines!,\" ERCIM + News, vol. 2020, no. 120, 2020, [Online]. Available: + https://ercim-news.ercim.eu/en120/special/you-tell-i-do-and-we-swap-until-we-connect-all-the-gold-mines", + "[2] J. Nasir*, U. Norman*, B. Bruno, and P. Dillenbourg, \"When Positive + Perception of the Robot Has No Effect on Learning,\" in 2020 29th IEEE + International Conference on Robot and Human Interactive Communication (RO-MAN), + Aug. 2020, pp. 313\u2013320, doi: 10.1109/RO-MAN47096.2020.9223343", "[3] U. + Norman*, T. Dinkar*, B. Bruno, and C. Clavel, \"Studying Alignment in a + Collaborative Learning Activity via Automatic Methods: The Link Between What We + Say and Do,\" Dialogue &amp;amp; Discourse, vol. 13, no. 2, pp. 1\u201348, + Aug. 2022, doi: 10.5210/dad.2022.201.", "[4] M. Sangin, G. Molinari, M.-A. + N\u00fcssli, and P. 
Dillenbourg, \"Facilitating peer knowledge modeling: Effects + of a knowledge awareness tool on collaborative learning outcomes and + processes,\"\" Computers in Human Behavior, vol. 27, no. 3, pp. 1059\u20131067, + May 2011, doi: 10.1016/j.chb.2010.05.032."]} + alignment + situated + dialogue + collaborative + learning + spontaneous + speech + disfluency + mutual + understanding + + 2021-04-09 + 2022-08-08 + Zenodo + + + + + + + + + + + true + false + 0.8 + dedup-result-decisiontree-v3 + + + + + doi_dedup___::ae235765bbc422195a6c9f632b2d77eb + + 2104.04429 + + arXiv + + 2022-08-05 + Studying + Alignment in a Collaborative Learning Activity via Automatic Methods: + The Link Between What We Say and Do + + + 10.48550/arxiv.2104.04429 + 10.5210/dad.2022.201 + + + corda__h2020::c4515ebef538a734cf11f795347f5dac + Advancing intuitive human-machine interaction with human-like social + capabilities for education in schools + 765955 + + + ec__________::EC::H2020 + ec__________::EC::H2020::MSCA-ITN-ETN + + ANIMATAS + + + doi_dedup___::0a6314b0ed275d915f5b57a259375691 + 2021-03-22 + Zenodo + 10.5281/zenodo.4627104 + JUSThink Dialogue and Actions Corpus + 10.5281/zenodo.4627103 + + + + + + + Zenodo + 10.5281/zenodo.4675070 + JUSThink Alignment Analysis + 2021-04-09 + + + + 2022-08-08 + Zenodo + 10.5281/zenodo.6974562 + + JUSThink Alignment Analysis (v1.1.0) + + + JUSThink + Alignment Analysis (v1.1.0) + 2022-08-08 + Zenodo + 10.5281/zenodo.4675069 + + + + + + + 2022-08-08 + + 10.5281/zenodo.4675069 + + https://opensource.org/licenses/MIT + + https://doi.org/10.5281/zenodo.4675069 + + + + + + + 2022-08-08 + + 10.5281/zenodo.6974562 + + https://opensource.org/licenses/MIT + + https://doi.org/10.5281/zenodo.6974562 + + + + + + + 2021-04-09 + + 10.5281/zenodo.4675070 + + https://opensource.org/licenses/MIT + + https://doi.org/10.5281/zenodo.4675070 + + + + + + +
+
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml index 910a366f6..be2ee7b98 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml @@ -2,11 +2,11 @@ - - - - - + + + + + @@ -14,17 +14,16 @@ - - - - + + + - - - - - - + + + + + + @@ -34,18 +33,17 @@ - - - + + - - - + + + - + - + @@ -54,35 +52,36 @@ - + - + - + - - + + - + - - + + + - - - + + + @@ -94,26 +93,29 @@ - + - - + + + + + - - + + - + - + - - + + - + @@ -132,13 +134,15 @@ + - + + - + @@ -156,20 +160,6 @@ - - - - - - - - - - - - - - - + \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml index 9331d4ac5..63fc84d75 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml @@ -21,7 +21,7 @@ hive_jdbc_url - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228 oozie.wf.workflow.notification.url diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index d699b68c3..41c3ed751 100644 --- 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -42,7 +42,9 @@ SELECT p.id, CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs, p.callidentifier, p.code, - p.totalcost + p.totalcost, + p.fundedamount, + p.currency FROM ${stats_db_name}.project_tmp p LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np FROM ${stats_db_name}.project_results pr diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index aee66fd5e..24e1a1355 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -59,7 +59,7 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; -create table ${stats_db_name}.result_orcid STORED AS PARQUET as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid from ( SELECT substr(res.id, 4) as id, auth_pid.value as orcid @@ -69,7 +69,7 @@ from ( LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; -create table ${stats_db_name}.result_result stored as parquet as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id @@ -82,7 +82,7 @@ where 
reltype='resultResult' and r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; -create table ${stats_db_name}.result_citations_oc stored as parquet as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as select substr(target, 4) as id, count(distinct substr(source, 4)) as citations from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id @@ -97,7 +97,7 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE group by substr(target, 4); -create table ${stats_db_name}.result_references_oc stored as parquet as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as select substr(source, 4) as id, count(distinct substr(target, 4)) as references from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 04c7f83b9..86ead4a2c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -42,7 +42,7 @@ join ${stats_db_name}.result res on res.id=r.id; create table ${stats_db_name}.result_apc as select r.id, r.amount, r.currency from ( - select substr(r.id, 4) as id, inst.processingchargeamount.value as amount, inst.processingchargecurrency.value as currency + select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) 
instances as inst) r join ${stats_db_name}.result res on res.id=r.id where r.amount is not null; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 417ed6e4e..1bda07629 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -454,16 +454,16 @@ FROM publication_datasources pd compute stats indi_pub_hybrid_oa_with_cc; create table indi_pub_downloads stored as parquet as -SELECT result_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats +SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id -order by no_dowloads desc; +order by no_downloads desc; compute stats indi_pub_downloads; create table indi_pub_downloads_datasource stored as parquet as -SELECT result_id, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats +SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id @@ -472,7 +472,7 @@ order by result_id; compute stats indi_pub_downloads_datasource; create table indi_pub_downloads_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us +SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join publication on result_id=id where downloads>0 GROUP BY 
result_id, `year` order by `year` asc; @@ -480,7 +480,7 @@ order by `year` asc; compute stats indi_pub_downloads_year; create table indi_pub_downloads_datasource_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us +SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id, `year` diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 290acbf9f..2505c3a34 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -39,7 +39,6 @@ create table TARGET.result stored as parquet as 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) - 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly @@ -224,18 +223,3 @@ create table TARGET.indi_result_with_pid stored as parquet as select * from SOUR --create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats 
TARGET.indi_software_gold_oa; ---denorm -alter table TARGET.result rename to TARGET.res_tmp; - -create table TARGET.result_denorm stored as parquet as - select distinct r.*, rp.project, p.acronym as pacronym, p.title as ptitle, p.funder as pfunder, p.funding_lvl0 as pfunding_lvl0, rd.datasource, d.name as dname, d.type as dtype - from TARGET.res_tmp r - left outer join TARGET.result_projects rp on rp.id=r.id - left outer join TARGET.result_datasources rd on rd.id=r.id - left outer join TARGET.project p on p.id=rp.project - left outer join TARGET.datasource d on d.id=rd.datasource; -compute stats TARGET.result_denorm; - -alter table TARGET.result_denorm rename to TARGET.result; -drop table TARGET.res_tmp; ---- done! \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 5461afde6..c31180c14 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -48,7 +48,9 @@ CREATE TABLE ${stats_db_name}.project_tmp delayedpubs INT, callidentifier STRING, code STRING, - totalcost FLOAT + totalcost FLOAT, + fundedamount FLOAT, + currency STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); INSERT INTO ${stats_db_name}.project_tmp @@ -72,7 +74,9 @@ SELECT substr(p.id, 4) AS id, 0 AS delayedpubs, p.callidentifier.value AS callidentifier, p.code.value AS code, - p.totalcost AS totalcost + p.totalcost AS totalcost, + p.fundedamount AS fundedamount, + p.currency.value AS currency FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;