diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index acac3594fa..b1494f649a 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -21,6 +21,10 @@ org.apache.hadoop hadoop-common + + commons-validator + commons-validator + org.apache.spark spark-core_2.11 diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 15fff07c02..da253c681a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -7,11 +7,13 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.validator.GenericValidator; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; public class GraphCleaningFunctions extends CleaningFunctions { @@ -115,7 +117,13 @@ public class GraphCleaningFunctions extends CleaningFunctions { o.setCountry(ModelConstants.UNKNOWN_COUNTRY); } } else if (value instanceof Relation) { - // nothing to clean here + Relation r = (Relation) value; + + if (!isValidDate(r.getValidationDate())) { + r.setValidationDate(null); + r.setValidated(false); + } + } else if (value instanceof Result) { Result r = (Result) value; @@ -292,6 +300,12 @@ public class GraphCleaningFunctions extends CleaningFunctions { return value; } + protected static boolean isValidDate(String date) { + return Stream + .of(ModelSupport.DATE_TIME_FORMATS) + .anyMatch(format -> GenericValidator.isDate(date, format, false)); + } + // HELPERS private static boolean isValidAuthorName(Author a) { diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java 
b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java index 7256d6489f..e8135f2019 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; +import java.time.format.DateTimeParseException; import java.util.HashSet; import java.util.List; import java.util.stream.Collectors; @@ -15,16 +16,23 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.*; public class OafMapperUtilsTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + @Test + public void testDateValidation() { + + assertTrue(GraphCleaningFunctions.isValidDate("2016-05-07T12:41:19.202Z")); + assertTrue(GraphCleaningFunctions.isValidDate("2020-09-10 11:08:52")); + assertTrue(GraphCleaningFunctions.isValidDate("2016-04-05")); + assertFalse(GraphCleaningFunctions.isValidDate("2016 April 05")); + + } + @Test public void testMergePubs() throws IOException { Publication p1 = read("publication_1.json", Publication.class); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala index d6101ba7a9..931ac06f64 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala @@ -56,6 +56,7 @@ object ImportDatacite { val hdfsTargetPath = new Path(targetPath) log.info(s"hdfsTargetPath is $hdfsTargetPath") + val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt val spkipImport = parser.get("skipImport") log.info(s"skipImport is $spkipImport") @@ -110,7 +111,7 @@ object ImportDatacite { println(s"last Timestamp is $ts") - val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf) + val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs) println(s"Imported from Datacite API $cnt documents") @@ -137,7 +138,7 @@ object ImportDatacite { } } - private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration): Long = { + private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = { var from:Long = timestamp * 1000 val delta:Long = 50000000L var client: DataciteAPIImporter = null @@ -148,7 +149,7 @@ object ImportDatacite { try { var start: Long = System.currentTimeMillis while (from < now) { - client = new DataciteAPIImporter(from, 100, from + delta) + client = new DataciteAPIImporter(from, bs, from + delta) var end: Long = 0 val key: IntWritable = new IntWritable(i) val value: Text = new Text diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java index e5a79300e5..760e5131db 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java @@ -143,7 +143,6 @@ public class PrepareProgramme { JavaRDD h2020Programmes = programme 
.toJavaRDD() - .filter(p -> p.getFrameworkProgramme().trim().equalsIgnoreCase("H2020")) .mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme)) .reduceByKey((a, b) -> { if (!a.getLanguage().equals("en")) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java index 3ef98e0215..cecd537ba0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java @@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme; import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -32,7 +31,6 @@ public class PrepareProjects { private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final HashMap programmeMap = new HashMap<>(); public static void main(String[] args) throws Exception { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java index a583b7bfa2..fdc12c6629 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java @@ -120,7 +120,6 @@ public class SparkAtomicActionJob { .map((MapFunction, Project>) c -> { CSVProject csvProject = c._1(); - Optional ocsvProgramme = 
Optional.ofNullable(c._2()); return Optional .ofNullable(c._2()) @@ -135,9 +134,9 @@ public class SparkAtomicActionJob { H2020Programme pm = new H2020Programme(); H2020Classification h2020classification = new H2020Classification(); pm.setCode(csvProject.getProgramme()); - h2020classification.setClassification(ocsvProgramme.get().getClassification()); + h2020classification.setClassification(csvProgramme.getClassification()); h2020classification.setH2020Programme(pm); - setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short()); + setLevelsandProgramme(h2020classification, csvProgramme.getClassification_short()); // setProgramme(h2020classification, ocsvProgramme.get().getClassification()); pp.setH2020classification(Arrays.asList(h2020classification)); @@ -145,10 +144,11 @@ public class SparkAtomicActionJob { }) .orElse(null); - }, Encoders.bean(Project.class)); + }, Encoders.bean(Project.class)) + .filter(Objects::nonNull); aaproject - .joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code"))) + .joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left") .map((MapFunction, Project>) p -> { Optional op = Optional.ofNullable(p._2()); Project rp = p._1(); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java index f991a4297b..d486f01049 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java @@ -7,14 +7,7 @@ import java.io.Serializable; * The model for the programme csv file */ public class CSVProgramme implements Serializable { - private String parentProgramme; - private String frameworkProgramme; - private String startDate; - private String endDate; - 
private String objective; - private String subjects; - private String legalBasis; - private String call; + private String rcn; private String code; @@ -80,67 +73,5 @@ public class CSVProgramme implements Serializable { this.language = language; } - public String getParentProgramme() { - return parentProgramme; - } - - public void setParentProgramme(String parentProgramme) { - this.parentProgramme = parentProgramme; - } - - public String getFrameworkProgramme() { - return frameworkProgramme; - } - - public void setFrameworkProgramme(String frameworkProgramme) { - this.frameworkProgramme = frameworkProgramme; - } - - public String getStartDate() { - return startDate; - } - - public void setStartDate(String startDate) { - this.startDate = startDate; - } - - public String getEndDate() { - return endDate; - } - - public void setEndDate(String endDate) { - this.endDate = endDate; - } - - public String getObjective() { - return objective; - } - - public void setObjective(String objective) { - this.objective = objective; - } - - public String getSubjects() { - return subjects; - } - - public void setSubjects(String subjects) { - this.subjects = subjects; - } - - public String getLegalBasis() { - return legalBasis; - } - - public void setLegalBasis(String legalBasis) { - this.legalBasis = legalBasis; - } - - public String getCall() { - return call; - } - - public void setCall(String call) { - this.call = call; - } +// } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java index 1a6ebb9e86..5f5b61d8b2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java @@ -26,7 +26,6 @@ public class EXCELParser { throws ClassNotFoundException, 
IOException, IllegalAccessException, InstantiationException, InvalidFormatException { - // OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL)); OPCPackage pkg = OPCPackage.open(file); XSSFWorkbook wb = new XSSFWorkbook(pkg); @@ -58,7 +57,6 @@ public class EXCELParser { for (int i = 0; i < headers.size(); i++) { Cell cell = row.getCell(i); - String value = dataFormatter.formatCellValue(cell); FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true); } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json index 69fb039ba8..a37ae4bba0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json @@ -18,6 +18,12 @@ "paramDescription": "avoid to downlaod new items but apply the previous update", "paramRequired": false }, + { + "paramName": "bs", + "paramLongName": "blocksize", + "paramDescription": "define the requests block size", + "paramRequired": false + }, { "paramName": "n", "paramLongName": "namenode", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml index 8ce5818851..e4f2715fb3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + projectFileURL @@ -18,6 +18,10 @@ outputPath path where to store the action set + + sheetName + the name of the sheet to read + @@ -31,10 +35,23 @@ - + + + + + 
+ + + + + + + + + eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV @@ -43,7 +60,7 @@ --hdfsPath${workingDir}/projects --classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProject - + @@ -55,7 +72,7 @@ --hdfsPath${workingDir}/programme --classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme - + @@ -68,7 +85,7 @@ --sheetName${sheetName} --classForNameeu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic - + @@ -81,7 +98,7 @@ --postgresUser${postgresUser} --postgresPassword${postgresPassword} - + @@ -105,10 +122,15 @@ --programmePath${workingDir}/programme --outputPath${workingDir}/preparedProgramme - + + + + + + yarn @@ -130,7 +152,7 @@ --outputPath${workingDir}/preparedProjects --dbProjectPath${workingDir}/dbProjects - + diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java index 1601d9b3ee..b7155bc3a4 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -20,8 +20,8 @@ import eu.dnetlib.dhp.collection.HttpConnector2; public class EXCELParserTest { private static Path workingDir; - private final HttpConnector2 httpConnector = new HttpConnector2(); - private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx"; + private HttpConnector2 httpConnector = new HttpConnector2(); + private static final String URL = "https://cordis.europa.eu/data/reference/cordisref-h2020topics.xlsx"; @BeforeAll public static void beforeAll() throws IOException { @@ -35,11 +35,12 @@ public class EXCELParserTest { EXCELParser excelParser = new EXCELParser(); - final String classForName = "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic"; - final String sheetName = "Topics"; - List pl = 
excelParser.parse(httpConnector.getInputSourceAsStream(URL), classForName, sheetName); + List pl = excelParser + .parse( + httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic", + "Topics"); - Assertions.assertEquals(3837, pl.size()); + Assertions.assertEquals(3878, pl.size()); } } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz index 01e804ff5b..620e1abfbf 100644 Binary files a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz differ diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz index 8b1982deea..71f132ad16 100644 Binary files a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz differ diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index ecbfd821ea..6f0a522446 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -26,6 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource; import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import 
eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.broker.objects.OaBrokerTypedValue; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Datasource; @@ -144,7 +145,7 @@ public class ConversionUtils { .filter(pid -> pid != null) .filter(pid -> pid.getQualifier() != null) .filter(pid -> pid.getQualifier().getClassid() != null) - .filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) + .filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID)) .map(pid -> pid.getValue()) .map(pid -> cleanOrcid(pid)) .filter(StringUtils::isNotBlank) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index a816ca991c..2157538990 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -93,7 +93,7 @@ public class PublicationToOaf implements Serializable { { put( ModelConstants.ORCID, - new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid")); + new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + ModelConstants.ORCID)); } }; @@ -126,8 +126,6 @@ public class PublicationToOaf implements Serializable { } } - public static final String PID_TYPES = "dnet:pid_types"; - public Oaf generatePublicationActionsFromJson(final String json) { if (parsedPublications != null) { parsedPublications.add(1); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Constants.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Constants.java index 00f0dd01c7..6dca09b632 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Constants.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/Constants.java @@ -24,8 +24,6 @@ public class Constants { public static String RESEARCH_INFRASTRUCTURE = "Research Infrastructure/Initiative"; - public static String ORCID = "orcid"; - static { accessRightsCoarMap.put("OPEN", "c_abf2"); accessRightsCoarMap.put("RESTRICTED", "c_16ec"); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java index cb052ebaa8..d30b3122c0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java @@ -503,7 +503,7 @@ public class ResultMapper implements Serializable { private static Pid getOrcid(List p) { for (StructuredProperty pid : p) { - if (pid.getQualifier().getClassid().equals(Constants.ORCID)) { + if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) { Optional di = Optional.ofNullable(pid.getDataInfo()); if (di.isPresent()) { return Pid diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 1beb616ba0..f441f2c2ae 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -76,7 +76,7 @@ public abstract class AbstractMdRecordToOafMapper { protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = 
"http://datacite.org/schema/kernel-3/"; protected static final Qualifier ORCID_PID_TYPE = qualifier( - "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); + ORCID_PENDING, ORCID_CLASSNAME, DNET_PID_TYPES, DNET_PID_TYPES); protected static final Qualifier MAG_PID_TYPE = qualifier( "MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 789f8a42b1..06aeab3451 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -19,6 +19,7 @@ import com.google.common.collect.Lists; import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; @@ -56,7 +57,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { author.setPid(new ArrayList<>()); if (StringUtils.isNotBlank(pid)) { - if (type.startsWith("ORCID")) { + if (type.toLowerCase().startsWith(ORCID)) { final String cleanedId = pid .replaceAll("http://orcid.org/", "") .replaceAll("https://orcid.org/", ""); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 9b0cf7a644..b7400873b6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -13,6 +13,7 @@ import 
org.dom4j.Node; import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; @@ -85,7 +86,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { .replaceAll(" ", "") .replaceAll("_", ""); - if (type.startsWith("ORCID")) { + if (type.toLowerCase().startsWith(ORCID)) { final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", ""); res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info)); } else if (type.startsWith("MAGID")) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 533237e5a1..08a596db5c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -99,8 +99,8 @@ public class MappersTest { .findFirst() .get(); assertEquals("0000-0001-6651-1178", pid.getValue()); - assertEquals("ORCID", pid.getQualifier().getClassid()); - assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.ORCID_PENDING, pid.getQualifier().getClassid()); + assertEquals(ModelConstants.ORCID_CLASSNAME, pid.getQualifier().getClassname()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); assertEquals("Votsi,Nefta", author.get().getFullname()); @@ -280,8 +280,8 @@ public class MappersTest { .findFirst() .get(); assertEquals("0000-0001-9074-1619", pid.getValue()); - assertEquals("ORCID", pid.getQualifier().getClassid()); - assertEquals("Open 
Researcher and Contributor ID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.ORCID_PENDING, pid.getQualifier().getClassid()); + assertEquals(ModelConstants.ORCID_CLASSNAME, pid.getQualifier().getClassname()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); assertEquals("Baracchini, Theo", author.get().getFullname()); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 526c1b5f44..86bbae99ec 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -1160,6 +1160,27 @@ public class XmlRecordFactory implements Serializable { .asXmlElement( "distributionlocation", instance.getDistributionlocation())); } + if (instance.getPid() != null) { + fields + .addAll( + instance + .getPid() + .stream() + .filter(Objects::nonNull) + .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); + } + if (instance.getAlternateIdentifier() != null) { + fields + .addAll( + instance + .getAlternateIdentifier() + .stream() + .filter(Objects::nonNull) + .map(p -> XmlSerializationUtils.mapStructuredProperty("alternateidentifier", p)) + .collect(Collectors.toList())); + } + if (instance.getRefereed() != null && !instance.getRefereed().isBlank()) { fields .add( diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java index 75805f66c7..6631cb4da6 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java @@ -61,6 +61,11 @@ public class XmlRecordFactoryTest { Assertions.assertEquals("0000-0001-9613-9956", doc.valueOf("//creator[@rank = '2']/@orcid")); Assertions.assertEquals("", doc.valueOf("//creator[@rank = '2']/@orcid_pending")); + Assertions.assertEquals("doi", doc.valueOf("//instance/pid/@classid")); + Assertions.assertEquals("10.1109/TED.2018.2853550", doc.valueOf("//instance/pid/text()")); + + Assertions.assertEquals("doi", doc.valueOf("//instance/alternateidentifier/@classid")); + Assertions.assertEquals("10.5689/LIB.2018.2853550", doc.valueOf("//instance/alternateidentifier/text()")); // TODO add assertions based of values extracted from the XML record } diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json index ea7a300513..91f1598535 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json @@ -284,6 +284,54 @@ "id": "50|CSC_________::0000ec4dd9df012feaafa77e71a0fb4c", "instance": [ { + "pid": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1109/TED.2018.2853550" + } + ], + "alternateIdentifier": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + 
"invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.5689/LIB.2018.2853550" + } + ], "accessright": { "classid": "OPEN", "classname": "Open Access", diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 0d8ff7ee32..8286e50391 100644 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -16,6 +16,14 @@ monitor_db_production_name the name of the monitor public database + + observatory_db_name + the monitor database name + + + observatory_db_production_name + the name of the monitor public database + stats_tool_api_url The url of the API of the stats tool. Is used to trigger the cache promote. @@ -77,6 +85,19 @@ ${monitor_db_production_name} updateProductionViews.sh + + + + + + + ${jobTracker} + ${nameNode} + updateProductionViews.sh + ${observatory_db_name} + ${observatory_db_production_name} + updateProductionViews.sh + diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh new file mode 100644 index 0000000000..ff03bca038 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory.sh @@ -0,0 +1,28 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + +echo "Getting file from " $4 +hdfs dfs -copyToLocal $4 + +echo "Creating observatory database" +impala-shell -q "drop database if exists ${TARGET} cascade" +impala-shell -q "create database if not exists ${TARGET}" +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +echo "Impala shell finished" + +echo "Updating shadow observatory database" +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +echo "Shadow db ready!" 
\ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index 51d3a73c9e..47d147f756 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -45,35 +45,3 @@ FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct; - - -------------------------------------------------------------------------------- --- To see with Antonis if the following is needed and where it should be placed -------------------------------------------------------------------------------- -CREATE TABLE ${stats_db_name}.numbers_country AS -SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications -FROM ${stats_db_name}.result r, - ${stats_db_name}.result_datasources rd, - ${stats_db_name}.datasource d, - ${stats_db_name}.datasource_organizations dor, - ${stats_db_name}.organization org -WHERE r.id = rd.id - AND rd.datasource = d.id - AND d.id = dor.id - AND dor.organization = org.id - AND r.type = 'publication' - AND r.bestlicence = 'Open Access' -GROUP BY org.country; - --- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software 
COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql index 833deff734..481fd9e8c2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql @@ -59,33 +59,4 @@ from result_gold union all select distinct r.id, false as gold from ${stats_db_name}.result r -where r.id not in (select id from result_gold); - --- shortcut result-country through the organization affiliation -create table ${stats_db_name}.result_affiliated_country as -select r.id as id, o.country as country -from ${stats_db_name}.result r -join ${stats_db_name}.result_organization ro on ro.id=r.id -join ${stats_db_name}.organization o on o.id=ro.organization -where o.country is not null and o.country!=''; - --- shortcut result-country through datasource of deposition -create table ${stats_db_name}.result_deposited_country as -select r.id as id, o.country as country -from ${stats_db_name}.result r -join ${stats_db_name}.result_datasources rd on rd.id=r.id -join ${stats_db_name}.datasource d on d.id=rd.datasource -join ${stats_db_name}.datasource_organizations dor on dor.id=d.id -join ${stats_db_name}.organization o on o.id=dor.organization -where o.country is not null and o.country!=''; - --- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS; --- ANALYZE TABLE 
${stats_db_name}.result_peerreviewed COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +where r.id not in (select id from result_gold); \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index 2bdc263ef5..f737c1ea61 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -52,7 +52,4 @@ LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; drop table if exists ${stats_db_name}.result; drop view if exists ${stats_db_name}.result; create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; -drop table ${stats_db_name}.result_tmp; --- --- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +drop table ${stats_db_name}.result_tmp; \ No newline at end of file diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 9477ada12f..af5e2a6a4d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -19,9 +19,6 @@ create table TARGET.result as select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) ) foo; compute stats TARGET.result; -create table TARGET.result_affiliated_country as select * from SOURCE.result_affiliated_country rac where exists (select 1 from TARGET.result r where r.id=rac.id); -compute stats TARGET.result_affiliated_country; - create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_citations; @@ -34,9 +31,6 @@ compute stats TARGET.result_concepts; create table TARGET.result_datasources as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_datasources; -create table TARGET.result_deposited_country as select * from SOURCE.result_deposited_country orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_deposited_country; - create table TARGET.result_fundercount as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_fundercount; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
new file mode 100644
index 0000000000..40cdf3f6d9
--- /dev/null
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql
@@ -0,0 +1,259 @@
+create table TARGET.result_affiliated_country stored as parquet as -- results linked through result_organization to organizations in European countries: distinct counts per green/gold/licence/pid/oa/peer_reviewed/type and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_organization ro on ro.id=r.id
+join SOURCE.organization o on o.id=ro.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name;
+
+create table TARGET.result_affiliated_year stored as parquet as -- same affiliation joins, counted per year instead of per country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year
+from SOURCE.result r
+join SOURCE.result_organization ro on ro.id=r.id
+join SOURCE.organization o on o.id=ro.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year;
+
+create table TARGET.result_affiliated_year_country stored as parquet as -- same affiliation joins, counted per year and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_organization ro on ro.id=r.id
+join SOURCE.organization o on o.id=ro.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name;
+
+create table TARGET.result_affiliated_datasource stored as parquet as -- same affiliation joins, counted per datasource name (left join, so dname is null for results without a datasource)
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, d.name as dname
+from SOURCE.result r
+join SOURCE.result_organization ro on ro.id=r.id
+join SOURCE.organization o on o.id=ro.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_datasources rd on rd.id=r.id
+left outer join SOURCE.datasource d on d.id=rd.datasource
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name;
+
+create table TARGET.result_affiliated_datasource_country stored as parquet as -- same affiliation joins, counted per datasource name and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_organization ro on ro.id=r.id
+join SOURCE.organization o on o.id=ro.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_datasources rd on rd.id=r.id
+left outer join SOURCE.datasource d on d.id=rd.datasource
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name;
+
+create table TARGET.result_affiliated_organization stored as parquet as -- same affiliation joins, counted per organization name
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, o.name as oname
+from SOURCE.result r
+join SOURCE.result_organization ro on ro.id=r.id
+join SOURCE.organization o on o.id=ro.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name;
+
+create table TARGET.result_affiliated_organization_country stored as parquet as -- same affiliation joins, counted per organization name and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_organization ro on ro.id=r.id
+join SOURCE.organization o on o.id=ro.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name;
+
+create table TARGET.result_affiliated_funder stored as parquet as -- same affiliation joins, counted per project funder (inner joins, so only results linked to a project)
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, p.funder as pfunder
+from SOURCE.result r
+join SOURCE.result_organization ro on ro.id=r.id
+join SOURCE.organization o on o.id=ro.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+join SOURCE.result_projects rp on rp.id=r.id
+join SOURCE.project p on p.id=rp.project
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder;
+
+create table TARGET.result_affiliated_funder_country stored as parquet as -- same affiliation joins, counted per project funder and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_organization ro on ro.id=r.id
+join SOURCE.organization o on o.id=ro.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+join SOURCE.result_projects rp on rp.id=r.id
+join SOURCE.project p on p.id=rp.project
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name;
+
+create table TARGET.result_deposited_country stored as parquet as -- results held in repository-type datasources whose organizations are in European countries: distinct counts per green/gold/licence/pid/oa/peer_reviewed/type and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_datasources rd on rd.id=r.id
+join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+join SOURCE.datasource_organizations dor on dor.id=d.id
+join SOURCE.organization o on o.id=dor.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name;
+
+create table TARGET.result_deposited_year stored as parquet as -- same deposition joins, counted per year
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year
+from SOURCE.result r
+join SOURCE.result_datasources rd on rd.id=r.id
+join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+join SOURCE.datasource_organizations dor on dor.id=d.id
+join SOURCE.organization o on o.id=dor.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year;
+
+create table TARGET.result_deposited_year_country stored as parquet as -- same deposition joins, counted per year and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_datasources rd on rd.id=r.id
+join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+join SOURCE.datasource_organizations dor on dor.id=d.id
+join SOURCE.organization o on o.id=dor.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name;
+
+create table TARGET.result_deposited_datasource stored as parquet as -- same deposition joins, counted per datasource name
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, d.name as dname
+from SOURCE.result r
+join SOURCE.result_datasources rd on rd.id=r.id
+join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+join SOURCE.datasource_organizations dor on dor.id=d.id
+join SOURCE.organization o on o.id=dor.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name;
+
+create table TARGET.result_deposited_datasource_country stored as parquet as -- same deposition joins, counted per datasource name and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_datasources rd on rd.id=r.id
+join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+join SOURCE.datasource_organizations dor on dor.id=d.id
+join SOURCE.organization o on o.id=dor.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name;
+
+create table TARGET.result_deposited_organization stored as parquet as -- same deposition joins, counted per name of the organization attached to the datasource
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname
+from SOURCE.result r
+join SOURCE.result_datasources rd on rd.id=r.id
+join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+join SOURCE.datasource_organizations dor on dor.id=d.id
+join SOURCE.organization o on o.id=dor.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name;
+
+create table TARGET.result_deposited_organization_country stored as parquet as -- same deposition joins, counted per organization name and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_datasources rd on rd.id=r.id
+join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+join SOURCE.datasource_organizations dor on dor.id=d.id
+join SOURCE.organization o on o.id=dor.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name;
+
+create table TARGET.result_deposited_funder stored as parquet as -- same deposition joins, counted per project funder (inner joins, so only results linked to a project)
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, p.funder as pfunder
+from SOURCE.result r
+join SOURCE.result_datasources rd on rd.id=r.id
+join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+join SOURCE.datasource_organizations dor on dor.id=d.id
+join SOURCE.organization o on o.id=dor.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+join SOURCE.result_projects rp on rp.id=r.id
+join SOURCE.project p on p.id=rp.project
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder;
+
+create table TARGET.result_deposited_funder_country stored as parquet as -- same deposition joins, counted per project funder and country
+select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
+    case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
+    r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname
+from SOURCE.result r
+join SOURCE.result_datasources rd on rd.id=r.id
+join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
+join SOURCE.datasource_organizations dor on dor.id=d.id
+join SOURCE.organization o on o.id=dor.organization
+join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
+join SOURCE.result_projects rp on rp.id=r.id
+join SOURCE.project p on p.id=rp.project
+left outer join SOURCE.result_licenses rl on rl.id=r.id
+left outer join SOURCE.result_pids pids on pids.id=r.id
+group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name;
+
+compute stats TARGET.result_affiliated_country;
+compute stats TARGET.result_affiliated_year;
+compute stats TARGET.result_affiliated_year_country;
+compute stats TARGET.result_affiliated_datasource;
+compute stats TARGET.result_affiliated_datasource_country;
+compute stats TARGET.result_affiliated_organization;
+compute stats TARGET.result_affiliated_organization_country;
+compute stats TARGET.result_affiliated_funder;
+compute stats TARGET.result_affiliated_funder_country;
+compute stats TARGET.result_deposited_country;
+compute stats TARGET.result_deposited_year;
+compute stats
TARGET.result_deposited_year_country; +compute stats TARGET.result_deposited_datasource; +compute stats TARGET.result_deposited_datasource_country; +compute stats TARGET.result_deposited_organization; +compute stats TARGET.result_deposited_organization_country; +compute stats TARGET.result_deposited_funder; +compute stats TARGET.result_deposited_funder_country; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 321500e2c2..824a8b3c7d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -25,6 +25,14 @@ monitor_db_shadow_name the name of the shadow monitor db + + observatory_db_name + the target observatory db name + + + observatory_db_shadow_name + the name of the shadow observatory db + stats_tool_api_url The url of the API of the stats tool. Is used to trigger the cache update. @@ -305,11 +313,26 @@ ${wf:appPath()}/scripts/step20-createMonitorDB.sql monitor.sh - + - + + + ${jobTracker} + ${nameNode} + observatory.sh + ${stats_db_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + ${wf:appPath()}/scripts/step21-createObservatoryDB.sql + observatory.sh + + + + + + ${jobTracker} ${nameNode} @@ -322,4 +345,4 @@ - + \ No newline at end of file diff --git a/pom.xml b/pom.xml index b4acd6e68e..5b96816d96 100644 --- a/pom.xml +++ b/pom.xml @@ -200,6 +200,12 @@ ${dhp.commons.lang.version} + + commons-validator + commons-validator + 1.7 + + com.google.guava guava @@ -730,7 +736,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.4.7] + [2.5.11] [4.0.3] [6.0.5] [3.1.6]