diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java index 17482c019..0b602b774 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge; import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; @@ -32,27 +33,33 @@ public class AuthorMerger { } - public static List mergeAuthor(final List a, final List b) { + public static List mergeAuthor(final List a, final List b, Double threshold) { int pa = countAuthorsPids(a); int pb = countAuthorsPids(b); List base, enrich; int sa = authorsSize(a); int sb = authorsSize(b); - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { + if (sa == sb) { base = pa > pb ? a : b; enrich = pa > pb ? b : a; + } else { + base = sa > sb ? a : b; + enrich = sa > sb ? 
b : a; } - enrichPidFromList(base, enrich); + enrichPidFromList(base, enrich, threshold); return base; } - private static void enrichPidFromList(List base, List enrich) { + public static List mergeAuthor(final List a, final List b) { + return mergeAuthor(a, b, THRESHOLD); + } + + private static void enrichPidFromList(List base, List enrich, Double threshold) { if (base == null || enrich == null) return; + + // (if an Author has more than 1 pid, it appears 2 times in the list) final Map basePidAuthorMap = base .stream() .filter(a -> a.getPid() != null && a.getPid().size() > 0) @@ -63,6 +70,7 @@ public class AuthorMerger { .map(p -> new Tuple2<>(pidToComparableString(p), a))) .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + // (list of pid that are missing in the other list) final List> pidToEnrich = enrich .stream() .filter(a -> a.getPid() != null && a.getPid().size() > 0) @@ -83,10 +91,10 @@ public class AuthorMerger { .max(Comparator.comparing(Tuple2::_1)); if (simAuthor.isPresent()) { - double th = THRESHOLD; + double th = threshold; // increase the threshold if the surname is too short if (simAuthor.get()._2().getSurname() != null - && simAuthor.get()._2().getSurname().length() <= 3) + && simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0) th = 0.99; if (simAuthor.get()._1() > th) { @@ -156,7 +164,7 @@ public class AuthorMerger { } private static String normalize(final String s) { - return nfd(s) + String[] normalized = nfd(s) .toLowerCase() // do not compact the regexes in a single expression, would cause StackOverflowError // in case @@ -166,7 +174,12 @@ public class AuthorMerger { .replaceAll("(\\p{Punct})+", " ") .replaceAll("(\\d)+", " ") .replaceAll("(\\n)+", " ") - .trim(); + .trim() + .split(" "); + + Arrays.sort(normalized); + + return String.join(" ", normalized); } private static String nfd(final String s) { diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java 
b/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java new file mode 100644 index 000000000..9c4e62214 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java @@ -0,0 +1,100 @@ + +package eu.dnetlib.dhp.oa.merge; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.util.MapDocumentUtil; +import scala.Tuple2; + +public class AuthorMergerTest { + + private String publicationsBasePath; + + private List> authors; + + @BeforeEach + public void setUp() throws Exception { + + publicationsBasePath = Paths + .get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI()) + .toFile() + .getAbsolutePath(); + + authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class) + .stream() + .map(p -> p._2().getAuthor()) + .collect(Collectors.toList()); + + } + + @Test + public void mergeTest() { // used in the dedup: threshold set to 0.95 + + for (List authors1 : authors) { + System.out.println("List " + (authors.indexOf(authors1) + 1)); + for (Author author : authors1) { + System.out.println(authorToString(author)); + } + } + + List merge = AuthorMerger.merge(authors); + + System.out.println("Merge "); + for (Author author : merge) { + System.out.println(authorToString(author)); + } + + Assertions.assertEquals(7, merge.size()); + + } + + public List> readSample(String path, Class clazz) { + List> res = new ArrayList<>(); + BufferedReader reader; + try { + reader = new BufferedReader(new 
FileReader(path)); + String line = reader.readLine(); + while (line != null) { + res + .add( + new Tuple2<>( + MapDocumentUtil.getJPathString("$.id", line), + new ObjectMapper().readValue(line, clazz))); + // read next line + line = reader.readLine(); + } + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + + return res; + } + + public String authorToString(Author a) { + + String print = "Fullname = "; + print += a.getFullname() + " pid = ["; + if (a.getPid() != null) + for (StructuredProperty sp : a.getPid()) { + print += sp.toComparableString() + " "; + } + print += "]"; + return print; + } +} diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json new file mode 100644 index 000000000..600181ba5 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json @@ -0,0 +1,3 @@ +{ "journal":{ "dataInfo":null, "conferenceplace":null, "issnPrinted":"0009-9260", "issnOnline":null, "issnLinking":null, "ep":"636", "iss":null, "sp":"632", "vol":"55", "edition":null, "conferencedate":null, "name":"Clinical Radiology" }, "measures":null, "author":[ { "rank":null, "fullname":"KARL TURETSCHEK", "affiliation":null, "pid":null, "surname":"TURETSCHEK", "name":"KARL" }, { "rank":null, "fullname":"WOLFGANG EBNER", "affiliation":null, "pid":null, "surname":"EBNER", "name":"WOLFGANG" }, { "rank":null, "fullname":"DOMINIK FLEISCHMANN", "affiliation":null, "pid":null, "surname":"FLEISCHMANN", "name":"DOMINIK" }, { "rank":null, "fullname":"PATRICK WUNDERBALDINGER", "affiliation":null, "pid":null, "surname":"WUNDERBALDINGER", "name":"PATRICK" }, { "rank":null, "fullname":"LUDWIG ERLACHER", "affiliation":null, "pid":null, "surname":"ERLACHER", "name":"LUDWIG" }, { "rank":null, "fullname":"THOMAS ZONTSICH", "affiliation":null, "pid":null, "surname":"ZONTSICH", "name":"THOMAS" }, { "rank":null, 
"fullname":"ALEXANDER A. BANKIER", "affiliation":null, "pid":null, "surname":"BANKIER", "name":"ALEXANDER A." } ], "resulttype":{ "classid":"publication", "schemeid":"dnet:result_typologies", "schemename":"dnet:result_typologies", "classname":"publication"}, "title":[ { "qualifier":{ "classid":"main title", "schemeid":"dnet:dataCite_title", "schemename":"dnet:dataCite_title", "classname":"main title" }, "dataInfo":null, "value":"Early Pulmonary Involvement in Ankylosing Spondylitis: Assessment With Thin-section CT" } ], "relevantdate":[ { "qualifier":{ "classid":"created", "schemeid":"dnet:dataCite_date", "schemename":"dnet:dataCite_date", "classname":"created" }, "dataInfo":null, "value":"2002-09-19T13:54:50Z" } ], "dateofacceptance":{ "dataInfo":null, "value":"2002-09-19T13:54:50Z" }, "publisher":{ "dataInfo":null, "value":"Elsevier BV" }, "embargoenddate":null, "fulltext":null, "contributor":null, "resourcetype":{ "classid":"0001", "schemeid":"dnet:dataCite_resource", "schemename":"dnet:dataCite_resource", "classname":"0001"}, "coverage":null, "bestaccessright":null, "externalReference":null, "format":null, "description":[ ], "source":[ { "dataInfo":null, "value":"Crossref" } ], "subject":[ { "qualifier":{ "classid":"keywords", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"keywords" }, "dataInfo":null, "value":"Radiology Nuclear Medicine and imaging" }, { "qualifier":{ "classid":"keywords", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"keywords" }, "dataInfo":null, "value":"General Medicine" } ], "language":null, "instance":[ { "processingchargecurrency":null, "refereed":null, "instancetype":{ "classid":"0001", "schemeid":"dnet:publication_resource", "schemename":"dnet:publication_resource", "classname":"Article" }, "hostedby":null, "distributionlocation":null, "processingchargeamount":null, "license":{ 
"dataInfo":null, "value":"https://www.elsevier.com/tdm/userlicense/1.0/" }, "accessright":{ "classid":"RESTRICTED", "schemeid":"dnet:access_modes", "schemename":"dnet:access_modes", "classname":"Restricted" }, "dateofacceptance":{ "dataInfo":null, "value":"2002-09-19T13:54:50Z" }, "collectedfrom":{ "dataInfo":null, "value":"Crossref", "key":"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2" }, "url":[ "https://api.elsevier.com/content/article/PII:S0009926000904987?httpAccept=text/xml", "https://api.elsevier.com/content/article/PII:S0009926000904987?httpAccept=text/plain", "http://dx.doi.org/10.1053/crad.2000.0498" ] } ], "context":null, "country":null, "originalId":[ "S0009926000904987", "10.1053/crad.2000.0498" ], "pid":[ { "qualifier":{ "classid":"doi", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"doi" }, "dataInfo":null, "value":"10.1053/crad.2000.0498" } ], "dateofcollection":"2020-02-06T20:40:22Z", "dateoftransformation":null, "oaiprovenance":null, "extraInfo":null, "id":"50|doiboost____::994b7e47b9e225ab6d5e14841cb45a7f", "collectedfrom":[ { "dataInfo":null, "value":"Crossref", "key":"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2" } ], "dataInfo":{ "trust":"0.9", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "lastupdatetimestamp":1581021622595 } +{ "journal":null, "measures":null, "author":[ { "rank":null, "fullname":"Dominik Fleischmann", "affiliation":null, "pid":[ { "qualifier":{ "classid":"ORCID", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"ORCID" }, "dataInfo":{ "trust":"0.91", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:crosswalk:entityregistry", "schemeid":"dnet:provenanceActions", 
"schemename":"dnet:provenanceActions", "classname":"Harvested"} }, "value":"0000-0003-0715-0952" } ], "surname":"Fleischmann", "name":"Dominik" } ], "resulttype":{ "classid":"publication", "schemeid":"dnet:result_typologies", "schemename":"dnet:result_typologies", "classname":"publication"}, "title":[ ], "relevantdate":[ ], "dateofacceptance":null, "publisher":null, "embargoenddate":null, "fulltext":[ ], "contributor":[ ], "resourcetype":null, "coverage":[ ], "bestaccessright":null, "externalReference":[ ], "format":[ ], "description":null, "source":[ ], "subject":[ ], "language":null, "instance":[ ], "context":[ ], "country":[ ], "originalId":[ ], "pid":[ { "qualifier":{ "classid":"doi", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"doi"}, "dataInfo":null, "value":"10.1053/crad.2000.0498" } ], "dateofcollection":null, "dateoftransformation":null, "oaiprovenance":null, "extraInfo":[ ], "id":"50|doiboost____::994b7e47b9e225ab6d5e14841cb45a7f", "collectedfrom":[ { "dataInfo":null, "value":"ORCID", "key":"10|openaire____::806360c771262b4d6770e7cdf04b5c5a" } ], "dataInfo":{ "trust":"0.9", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "lastupdatetimestamp":null } +{ "journal":{ "dataInfo":null, "conferenceplace":null, "issnPrinted":"0009-9260", "issnOnline":null, "issnLinking":null, "ep":"636", "iss":"8", "sp":"632", "vol":"55", "edition":null, "conferencedate":null, "name":"Clinical Radiology" }, "measures":null, "author":[ { "rank":null, "fullname":"T. 
Zontsich", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL"}, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/1966908432" } ], "surname":null, "name":null }, { "rank":null, "fullname":"L Erlacher", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL"}, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/687931320" } ], "surname":null, "name":null }, { "rank":null, "fullname":"Dominik Fleischmann", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL"}, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/2156559961" } ], "surname":null, "name":null }, { "rank":null, "fullname":"Alexander A. 
Bankier", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL"}, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/1107971609" } ], "surname":null, "name":null }, { "rank":null, "fullname":"Patrick Wunderbaldinger", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL" }, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/2422340537" } ], "surname":null, "name":null }, { "rank":null, "fullname":"Wolfgang Ebner", "affiliation":null, "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL" }, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/2186462571" } ], "surname":null, "name":null }, { "rank":null, "fullname":"K. 
Turetschek", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL" }, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/321765676" } ], "surname":null, "name":null } ], "resulttype":{ "classid":"publication", "schemeid":"dnet:result_typologies", "schemename":"dnet:result_typologies", "classname":"publication" }, "title":[ { "qualifier":{ "classid":"main title", "schemeid":"dnet:dataCite_title", "schemename":"dnet:dataCite_title", "classname":"main title" }, "dataInfo":null, "value":"early pulmonary involvement in ankylosing spondylitis assessment with thin section ct" }, { "qualifier":{ "classid":"alternative title", "schemeid":"dnet:dataCite_title", "schemename":"dnet:dataCite_title", "classname":"alternative title" }, "dataInfo":null, "value":"Early pulmonary involvement in ankylosing spondylitis: assessment with thin-section CT." } ], "relevantdate":null, "dateofacceptance":{ "dataInfo":null, "value":"2000-08-01" }, "publisher":{ "dataInfo":null, "value":"Elsevier" }, "embargoenddate":null, "fulltext":null, "contributor":null, "resourcetype":null, "coverage":null, "bestaccessright":null, "externalReference":null, "format":null, "description":[ { "dataInfo":null, "value":"Abstract AIM: To determine the frequency and the distribution of early pulmonary lesions in patients with ankylosing spondylitis (AS) and a normal chest X-ray on thin-section CT and to correlate the CT findings with the results of pulmonary function tests and clinical data. MATERIALS AND METHODS: Twenty-five patients with clinically proven AS and no history of smoking underwent clinical examinations, pulmonary function tests (PFT), chest radiography, and thin-section CT. Four of 25 patients (16%), who had obvious signs on plain films suggestive of pre-existing disorders unrelated to AS were excluded. 
RESULTS: Fifteen of 21 patients (71%) had abnormalities on thin-section CT. The most frequent abnormalities were thickening of the interlobular septa in seven of 21 patients (33%), mild bronchial wall thickening in (6/21, 29%), pleural thickening and pleuropulmonary irregularities (both 29%) and linear septal thickening (6/21, 29%). In six patients there were no signs of pleuropulmonary involvement. Eight of 15 patients (53%) with abnormal and four of six patients (67%) with normal CT findings revealed mild restrictive lung function impairment. CONCLUSION: Patients with AS but a normal chest radiograph frequently have abnormalities on thin-section CT. As these abnormalities are usually subtle and their extent does not correlate with functional and clinical data, the overall routine impact of thin-section CT in the diagnosis of AS is limited. Turetschek, K , (2000) Clinical Radiology53, 632–636." } ], "source":[ { "dataInfo":null, "value":null } ], "subject":[ { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Complication" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Chest radiograph" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.580897", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine.diagnostic_test" }, { 
"qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.580897", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"In patient" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Radiography" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.4582326", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"business.industry" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.4582326", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", 
"classname":"sysimport:actionset" } }, "value":"business" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Thin section ct" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Respiratory disease" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.49358836", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine.disease" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.49358836", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Ankylosing spondylitis" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", 
"classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.49937168", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine.disease" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.49937168", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Radiology" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.4573571", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine.medical_specialty" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.4573571", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, 
"provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.40295774", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"business.industry" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.40295774", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"business" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Pulmonary function testing" } ], "language":null, "instance":[ { "processingchargecurrency":null, "refereed":null, "instancetype":null, "hostedby":null, "distributionlocation":null, "processingchargeamount":null, "license":null, "accessright":null, "dateofacceptance":null, "collectedfrom":{ 
"dataInfo":null, "value":"Microsoft Academic Graph", "key":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a" }, "url":[ "https://www.ncbi.nlm.nih.gov/pubmed/10964736", "https://www.sciencedirect.com/science/article/pii/S0009926000904987", "https://academic.microsoft.com/#/detail/1990704599" ] } ], "context":null, "country":null, "originalId":[ "1990704599", "10.1053/crad.2000.0498" ], "pid":[ { "qualifier":{ "classid":"doi", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"doi" }, "dataInfo":null, "value":"10.1053/crad.2000.0498" } ], "dateofcollection":null, "dateoftransformation":null, "oaiprovenance":null, "extraInfo":null, "id":"50|doiboost____::994b7e47b9e225ab6d5e14841cb45a7f", "collectedfrom":[ { "dataInfo":null, "value":"Microsoft Academic Graph", "key":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a" } ], "dataInfo":{ "trust":"0.9", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset"} }, "lastupdatetimestamp":null } \ No newline at end of file diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index e72a0d69c..10b2c7418 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -108,6 +108,8 @@ public class ModelConstants { public static final KeyValue UNKNOWN_REPOSITORY = keyValue( "10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository"); + public static final Qualifier UNKNOWN_COUNTRY = qualifier(UNKNOWN, "Unknown", DNET_COUNTRY_TYPE, DNET_COUNTRY_TYPE); + private static Qualifier qualifier( final String classid, final String classname, diff --git 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java index b5bca2e93..0c7903137 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -3,6 +3,9 @@ package eu.dnetlib.dhp.schema.common; import static com.google.common.base.Preconditions.checkArgument; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; import java.util.Map; import java.util.Objects; import java.util.Optional; @@ -473,4 +476,25 @@ public class ModelSupport { private static String idFnForOafEntity(T t) { return ((OafEntity) t).getId(); } + + public static final String ISO8601FORMAT = "yyyy-MM-dd'T'HH:mm:ssZ"; + + public static String oldest(String dateA, String dateB) throws ParseException { + + if (StringUtils.isBlank(dateA)) { + return dateB; + } + if (StringUtils.isBlank(dateB)) { + return dateA; + } + if (StringUtils.isNotBlank(dateA) && StringUtils.isNotBlank(dateB)) { + + final Date a = new SimpleDateFormat(ISO8601FORMAT).parse(dateA); + final Date b = new SimpleDateFormat(ISO8601FORMAT).parse(dateB); + + return a.before(b) ? dateA : dateB; + } else { + return null; + } + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Measure.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Measure.java index 78a4c6801..c0c14d10d 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Measure.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Measure.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.schema.oaf; -import java.io.Serializable; import java.util.List; import com.google.common.base.Objects; @@ -9,7 +8,7 @@ import com.google.common.base.Objects; /** * Represent a measure, must be further described by a system available resource providing name and descriptions. 
*/ -public class Measure implements Serializable { +public class Measure { /** * Unique measure identifier. @@ -17,7 +16,7 @@ public class Measure implements Serializable { private String id; /** - * List of units associated with this measure. KeyValue provides a pair to store the label (key) and the value, plus + * List of units associated with this measure. KeyValue provides a pair to store the label (key) and the value, plus * common provenance information. */ private List unit; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java index f4f5baa7b..494123fdf 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java @@ -62,8 +62,6 @@ public abstract class Oaf implements Serializable { .distinct() // relies on KeyValue.equals .collect(Collectors.toList())); - mergeOAFDataInfo(o); - setLastupdatetimestamp( Math .max( diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java index b698c957d..4be4d5d30 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java @@ -351,8 +351,6 @@ public class Project extends OafEntity implements Serializable { ? 
p.getFundedamount() : fundedamount; - // programme = mergeLists(programme, p.getProgramme()); - h2020classification = mergeLists(h2020classification, p.getH2020classification()); mergeOAFDataInfo(e); diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java index 0de34dbec..8825d7137 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java @@ -1,8 +1,11 @@ package eu.dnetlib.dhp.schema.oaf; +import eu.dnetlib.dhp.schema.common.ModelSupport; + import static com.google.common.base.Preconditions.checkArgument; +import java.text.ParseException; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -106,7 +109,7 @@ public class Relation extends Oaf { } public Boolean getValidated() { - return validated; + return Objects.nonNull(validated) && validated; } public void setValidated(Boolean validated) { @@ -130,6 +133,13 @@ public class Relation extends Oaf { Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal"); checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal"); + setValidated(getValidated() || r.getValidated()); + try { + setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate())); + } catch (ParseException e) { + throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate())); + } + super.mergeFrom(r); } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index 443c18230..845c4c982 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -243,7 +243,7 @@ public class Result extends 
OafEntity implements Serializable { Result r = (Result) e; - // TODO consider merging also Measures + measures = mergeLists(measures, r.getMeasures()); instance = mergeLists(instance, r.getInstance()); @@ -323,13 +323,13 @@ public class Result extends OafEntity implements Serializable { if (a.size() == b.size()) { int msa = a .stream() - .filter(i -> i.getValue() != null) + .filter(i -> i != null && i.getValue() != null) .map(i -> i.getValue().length()) .max(Comparator.naturalOrder()) .orElse(0); int msb = b .stream() - .filter(i -> i.getValue() != null) + .filter(i -> i != null && i.getValue() != null) .map(i -> i.getValue().length()) .max(Comparator.naturalOrder()) .orElse(0); diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java index f91646f2c..6ee5b9d85 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java @@ -63,6 +63,51 @@ public class MergeTest { assertEquals(3, a.getSubject().size()); } + @Test + public void mergeRelationTest() { + + Relation a = createRel(null, null); + Relation b = createRel(null, null); + a.mergeFrom(b); + assertEquals(a, b); + + a = createRel(true, null); + b = createRel(null, null); + a.mergeFrom(b); + assertEquals(true, a.getValidated()); + + a = createRel(true, null); + b = createRel(false, null); + a.mergeFrom(b); + assertEquals(true, a.getValidated()); + + a = createRel(true, null); + b = createRel(true, "2016-04-05T12:41:19.202Z"); + a.mergeFrom(b); + assertEquals("2016-04-05T12:41:19.202Z", a.getValidationDate()); + } + + @Test + public void mergeRelationTestParseException() { + assertThrows(IllegalArgumentException.class, () -> { + Relation a = createRel(true, "2016-04-05"); + Relation b = createRel(true, "2016-04-05"); + a.mergeFrom(b); + }); + } + + private Relation createRel(Boolean validated, String validationDate) { + Relation rel = 
new Relation(); + rel.setSource("1"); + rel.setTarget("2"); + rel.setRelType("reltype"); + rel.setSubRelType("subreltype"); + rel.setRelClass("relclass"); + rel.setValidated(validated); + rel.setValidationDate(validationDate); + return rel; + } + private KeyValue setKV(final String key, final String value) { KeyValue k = new KeyValue(); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index 50bda898c..2cd37d9ea 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -125,16 +125,17 @@ public class SparkAtomicActionScoreJob implements Serializable { return ret; }, Encoders.bean(BipScore.class)) .groupByKey((MapFunction) value -> value.getId(), Encoders.STRING()) - .mapGroups((MapGroupsFunction) (k, it) -> { - Result ret = inputClazz.newInstance(); + .mapGroups((MapGroupsFunction) (k, it) -> { + Result ret = new Result(); + ret.setDataInfo(getDataInfo()); BipScore first = it.next(); ret.setId(first.getId()); ret.setMeasures(getMeasure(first)); it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value))); - return (I) ret; - }, Encoders.bean(inputClazz)) + return ret; + }, Encoders.bean(Result.class)) .toJavaRDD() .map(p -> new AtomicAction(inputClazz, p)) .mapToPair( diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json index 67911eef1..7663a454b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json 
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json @@ -1,20 +1,20 @@ [ { - "paramName": "issm", - "paramLongName": "isSparkSessionManaged", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false -}, -{ -"paramName": "ip", -"paramLongName": "inputPath", -"paramDescription": "the URL from where to get the programme file", -"paramRequired": true -}, -{ -"paramName": "o", -"paramLongName": "outputPath", -"paramDescription": "the path of the new ActionSet", -"paramRequired": true -} + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "ip", + "paramLongName": "inputPath", + "paramDescription": "the URL from where to get the programme file", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json index ae844a0c9..31771a40a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json @@ -1,22 +1,22 @@ [ { - "paramName": "issm", - "paramLongName": "isSparkSessionManaged", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false -}, -{ -"paramName": "ip", -"paramLongName": "inputPath", -"paramDescription": "the URL from where to get the programme file", -"paramRequired": true -}, -{ -"paramName": "o", -"paramLongName": 
"outputPath", -"paramDescription": "the path of the new ActionSet", -"paramRequired": true -}, + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "ip", + "paramLongName": "inputPath", + "paramDescription": "the URL from where to get the programme file", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + }, { "paramName": "rtn", "paramLongName": "resultTableName", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml index ea2e9f58a..5311a6614 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml @@ -21,10 +21,10 @@ - - - - + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java index 4b02e7485..7200d2896 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java @@ -5,18 +5,12 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -import java.util.stream.Collectors; import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.SequenceFileOutputFormat; import 
org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.ForeachFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; @@ -31,9 +25,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import scala.Tuple2; +import eu.dnetlib.dhp.schema.oaf.Publication; public class SparkAtomicActionScoreJobTest { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java index 5ca865e8f..d42c692f7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/CheckDuplictedIdsJob.java @@ -32,15 +32,15 @@ public class CheckDuplictedIdsJob { IOUtils .toString( CheckDuplictedIdsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/check_duplicates.json"))); parser.parseArgument(args); final SparkConf conf = new SparkConf(); - final String eventsPath = parser.get("workingPath") + "/events"; + final String eventsPath = parser.get("outputDir") + "/events"; log.info("eventsPath: {}", eventsPath); - final String countPath = parser.get("workingPath") + "/counts"; + final String countPath = parser.get("outputDir") + "/counts"; log.info("countPath: {}", countPath); final SparkSession spark = 
SparkSession.builder().config(conf).getOrCreate(); @@ -59,6 +59,7 @@ public class CheckDuplictedIdsJob { .map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG())) .write() .mode(SaveMode.Overwrite) + .option("compression", "gzip") .json(countPath); ; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java index cfee360c5..1ae241e34 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsJob.java @@ -44,10 +44,10 @@ public class GenerateEventsJob { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String eventsPath = workingPath + "/events"; + final String eventsPath = parser.get("outputDir") + "/events"; log.info("eventsPath: {}", eventsPath); final Set dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist"); @@ -59,6 +59,9 @@ public class GenerateEventsJob { final Set dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist"); log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ",")); + final Set topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist"); + log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ",")); + final SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { @@ -70,12 +73,12 @@ public class GenerateEventsJob { final LongAccumulator total = spark.sparkContext().longAccumulator("total_events"); final Dataset groups = ClusterUtils - .readPath(spark, 
workingPath + "/duplicates", ResultGroup.class); + .readPath(spark, workingDir + "/duplicates", ResultGroup.class); final Dataset dataset = groups .map( g -> EventFinder - .generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators), + .generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators), Encoders .bean(EventGroup.class)) .flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class)); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java index d5c53ea36..2772f8fd1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateStatsJob.java @@ -46,7 +46,7 @@ public class GenerateStatsJob { final SparkConf conf = new SparkConf(); - final String eventsPath = parser.get("workingPath") + "/events"; + final String eventsPath = parser.get("outputDir") + "/events"; log.info("eventsPath: {}", eventsPath); final String dbUrl = parser.get("dbUrl"); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexEventSubsetJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexEventSubsetJob.java index d3cbe0034..e18a7ef56 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexEventSubsetJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexEventSubsetJob.java @@ -46,7 +46,7 @@ public class IndexEventSubsetJob { final SparkConf conf = new SparkConf(); - final String eventsPath = parser.get("workingPath") + "/events"; + final String eventsPath = parser.get("outputDir") + "/events"; log.info("eventsPath: {}", eventsPath); final String index = parser.get("index"); @@ -55,6 +55,18 @@ public class IndexEventSubsetJob { 
final String indexHost = parser.get("esHost"); log.info("indexHost: {}", indexHost); + final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount"); + log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount); + + final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait"); + log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait); + + final String esBatchSizeEntries = parser.get("esBatchSizeEntries"); + log.info("esBatchSizeEntries: {}", esBatchSizeEntries); + + final String esNodesWanOnly = parser.get("esNodesWanOnly"); + log.info("esNodesWanOnly: {}", esNodesWanOnly); + final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic")); log.info("maxEventsForTopic: {}", maxEventsForTopic); @@ -86,10 +98,10 @@ public class IndexEventSubsetJob { esCfg.put("es.index.auto.create", "false"); esCfg.put("es.nodes", indexHost); esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY - esCfg.put("es.batch.write.retry.count", "8"); - esCfg.put("es.batch.write.retry.wait", "60s"); - esCfg.put("es.batch.size.entries", "200"); - esCfg.put("es.nodes.wan.only", "true"); + esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount); + esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait); + esCfg.put("es.batch.size.entries", esBatchSizeEntries); + esCfg.put("es.nodes.wan.only", esNodesWanOnly); log.info("*** Start indexing"); JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java index 792a2354a..75f4eb066 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexNotificationsJob.java @@ -54,7 +54,7 @@ public class IndexNotificationsJob { final SparkConf conf = new SparkConf(); - 
final String eventsPath = parser.get("workingPath") + "/events"; + final String eventsPath = parser.get("outputDir") + "/events"; log.info("eventsPath: {}", eventsPath); final String index = parser.get("index"); @@ -63,6 +63,18 @@ public class IndexNotificationsJob { final String indexHost = parser.get("esHost"); log.info("indexHost: {}", indexHost); + final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount"); + log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount); + + final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait"); + log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait); + + final String esBatchSizeEntries = parser.get("esBatchSizeEntries"); + log.info("esBatchSizeEntries: {}", esBatchSizeEntries); + + final String esNodesWanOnly = parser.get("esNodesWanOnly"); + log.info("esNodesWanOnly: {}", esNodesWanOnly); + final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl"); log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl); @@ -92,10 +104,10 @@ public class IndexNotificationsJob { esCfg.put("es.index.auto.create", "false"); esCfg.put("es.nodes", indexHost); esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY - esCfg.put("es.batch.write.retry.count", "8"); - esCfg.put("es.batch.write.retry.wait", "60s"); - esCfg.put("es.batch.size.entries", "200"); - esCfg.put("es.nodes.wan.only", "true"); + esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount); + esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait); + esCfg.put("es.batch.size.entries", esBatchSizeEntries); + esCfg.put("es.nodes.wan.only", esNodesWanOnly); log.info("*** Start indexing"); JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java index 762bfbb90..380a689e4 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/IndexOnESJob.java @@ -36,7 +36,7 @@ public class IndexOnESJob { final SparkConf conf = new SparkConf(); - final String eventsPath = parser.get("workingPath") + "/events"; + final String eventsPath = parser.get("outputDir") + "/events"; log.info("eventsPath: {}", eventsPath); final String index = parser.get("index"); @@ -45,6 +45,18 @@ public class IndexOnESJob { final String indexHost = parser.get("esHost"); log.info("indexHost: {}", indexHost); + final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount"); + log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount); + + final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait"); + log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait); + + final String esBatchSizeEntries = parser.get("esBatchSizeEntries"); + log.info("esBatchSizeEntries: {}", esBatchSizeEntries); + + final String esNodesWanOnly = parser.get("esNodesWanOnly"); + log.info("esNodesWanOnly: {}", esNodesWanOnly); + final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); final JavaRDD inputRdd = ClusterUtils @@ -53,15 +65,13 @@ public class IndexOnESJob { .javaRDD(); final Map esCfg = new HashMap<>(); - // esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54"); - esCfg.put("es.index.auto.create", "false"); esCfg.put("es.nodes", indexHost); esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY - esCfg.put("es.batch.write.retry.count", "8"); - esCfg.put("es.batch.write.retry.wait", "60s"); - esCfg.put("es.batch.size.entries", "200"); - esCfg.put("es.nodes.wan.only", "true"); + esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount); + esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait); + esCfg.put("es.batch.size.entries", esBatchSizeEntries); + esCfg.put("es.nodes.wan.only", 
esNodesWanOnly); JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java index 39fa76e43..01778ad74 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep0Job.java @@ -42,10 +42,10 @@ public class JoinStep0Job { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String joinedEntitiesPath = workingPath + "/joinedEntities_step0"; + final String joinedEntitiesPath = workingDir + "/joinedEntities_step0"; log.info("joinedEntitiesPath: {}", joinedEntitiesPath); final SparkConf conf = new SparkConf(); @@ -57,10 +57,10 @@ public class JoinStep0Job { final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final Dataset sources = ClusterUtils - .readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class); + .readPath(spark, workingDir + "/simpleEntities", OaBrokerMainEntity.class); final Dataset typedRels = ClusterUtils - .readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class); + .readPath(spark, workingDir + "/relatedDatasources", RelatedDatasource.class); final TypedColumn, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator() .toColumn(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java index 8e502f736..82c3619e1 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep1Job.java @@ -40,10 +40,10 @@ public class JoinStep1Job { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String joinedEntitiesPath = workingPath + "/joinedEntities_step1"; + final String joinedEntitiesPath = workingDir + "/joinedEntities_step1"; log.info("joinedEntitiesPath: {}", joinedEntitiesPath); final SparkConf conf = new SparkConf(); @@ -55,10 +55,10 @@ public class JoinStep1Job { final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final Dataset sources = ClusterUtils - .readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class); + .readPath(spark, workingDir + "/joinedEntities_step0", OaBrokerMainEntity.class); final Dataset typedRels = ClusterUtils - .readPath(spark, workingPath + "/relatedProjects", RelatedProject.class); + .readPath(spark, workingDir + "/relatedProjects", RelatedProject.class); final TypedColumn, OaBrokerMainEntity> aggr = new RelatedProjectAggregator() .toColumn(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java index 55ab497f0..bd6135d41 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep2Job.java @@ -39,10 +39,10 @@ public class JoinStep2Job { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String workingPath = parser.get("workingPath"); - 
log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String joinedEntitiesPath = workingPath + "/joinedEntities_step2"; + final String joinedEntitiesPath = workingDir + "/joinedEntities_step2"; log.info("joinedEntitiesPath: {}", joinedEntitiesPath); final SparkConf conf = new SparkConf(); @@ -54,10 +54,10 @@ public class JoinStep2Job { final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final Dataset sources = ClusterUtils - .readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class); + .readPath(spark, workingDir + "/joinedEntities_step1", OaBrokerMainEntity.class); final Dataset typedRels = ClusterUtils - .readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class); + .readPath(spark, workingDir + "/relatedSoftwares", RelatedSoftware.class); final TypedColumn, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator() .toColumn(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java index 4d06f6f13..18e8c00b2 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep3Job.java @@ -40,10 +40,10 @@ public class JoinStep3Job { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String joinedEntitiesPath = workingPath + "/joinedEntities_step3"; + final String joinedEntitiesPath = workingDir + "/joinedEntities_step3"; log.info("joinedEntitiesPath: {}", joinedEntitiesPath); final SparkConf conf = new SparkConf(); 
@@ -55,10 +55,10 @@ public class JoinStep3Job { final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final Dataset sources = ClusterUtils - .readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class); + .readPath(spark, workingDir + "/joinedEntities_step2", OaBrokerMainEntity.class); final Dataset typedRels = ClusterUtils - .readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class); + .readPath(spark, workingDir + "/relatedDatasets", RelatedDataset.class); final TypedColumn, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator() .toColumn(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java index b53d7e39b..965530362 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/JoinStep4Job.java @@ -40,10 +40,10 @@ public class JoinStep4Job { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String joinedEntitiesPath = workingPath + "/joinedEntities_step4"; + final String joinedEntitiesPath = workingDir + "/joinedEntities_step4"; log.info("joinedEntitiesPath: {}", joinedEntitiesPath); final SparkConf conf = new SparkConf(); @@ -55,10 +55,10 @@ public class JoinStep4Job { final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities"); final Dataset sources = ClusterUtils - .readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class); + .readPath(spark, workingDir + "/joinedEntities_step3", OaBrokerMainEntity.class); final Dataset typedRels = ClusterUtils - 
.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class); + .readPath(spark, workingDir + "/relatedPublications", RelatedPublication.class); final TypedColumn, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator() .toColumn(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PartitionEventsByDsIdJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PartitionEventsByDsIdJob.java index e9644122f..e061c0d3b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PartitionEventsByDsIdJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PartitionEventsByDsIdJob.java @@ -36,7 +36,7 @@ import eu.dnetlib.dhp.broker.oa.util.ClusterUtils; public class PartitionEventsByDsIdJob { private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class); - private static final String OPENDOAR_NSPREFIX = "10|opendoar____::"; + private static final String OPENDOAR_NSPREFIX = "opendoar____::"; public static void main(final String[] args) throws Exception { @@ -55,10 +55,10 @@ public class PartitionEventsByDsIdJob { final SparkConf conf = new SparkConf(); - final String eventsPath = parser.get("workingPath") + "/events"; + final String eventsPath = parser.get("outputDir") + "/events"; log.info("eventsPath: {}", eventsPath); - final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId"; + final String partitionPath = parser.get("outputDir") + "/eventsByOpendoarId"; log.info("partitionPath: {}", partitionPath); final String opendoarIds = parser.get("opendoarIds"); @@ -91,6 +91,7 @@ public class PartitionEventsByDsIdJob { .write() .partitionBy("group") .mode(SaveMode.Overwrite) + .option("compression", "gzip") .json(partitionPath); }); @@ -122,6 +123,7 @@ public class PartitionEventsByDsIdJob { final ShortEventMessageWithGroupId res = new ShortEventMessageWithGroupId(); + 
res.setEventId(e.getEventId()); res.setOriginalId(payload.getResult().getOriginalId()); res.setTitle(payload.getResult().getTitles().stream().filter(StringUtils::isNotBlank).findFirst().orElse(null)); res.setTopic(e.getTopic()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java index eb9add00d..dc156cbcf 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareGroupsJob.java @@ -45,10 +45,10 @@ public class PrepareGroupsJob { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String groupsPath = workingPath + "/duplicates"; + final String groupsPath = workingDir + "/duplicates"; log.info("groupsPath: {}", groupsPath); final SparkConf conf = new SparkConf(); @@ -60,10 +60,10 @@ public class PrepareGroupsJob { final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups"); final Dataset results = ClusterUtils - .readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class); + .readPath(spark, workingDir + "/joinedEntities_step4", OaBrokerMainEntity.class); final Dataset mergedRels = ClusterUtils - .readPath(spark, graphPath + "/relation", Relation.class) + .loadRelations(graphPath, spark) .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); final TypedColumn, ResultGroup> aggr = new ResultAggregator() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java index 0cfc1adcb..9bdf32a64 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasetsJob.java @@ -42,10 +42,10 @@ public class PrepareRelatedDatasetsJob { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String relsPath = workingPath + "/relatedDatasets"; + final String relsPath = workingDir + "/relatedDatasets"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); @@ -62,7 +62,7 @@ public class PrepareRelatedDatasetsJob { .map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class)); final Dataset rels = ClusterUtils - .readPath(spark, graphPath + "/relation", Relation.class) + .loadRelations(graphPath, spark) .filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) @@ -72,7 +72,8 @@ public class PrepareRelatedDatasetsJob { final Dataset dataset = rels .joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner") .map(t -> { - final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2); + final RelatedDataset rel = new RelatedDataset(t._1.getSource(), + t._2); rel.getRelDataset().setRelType(t._1.getRelClass()); return rel; }, Encoders.bean(RelatedDataset.class)); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java index 166372a7f..0c2318127 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedDatasourcesJob.java @@ -48,10 +48,10 @@ public class PrepareRelatedDatasourcesJob { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String relsPath = workingPath + "/relatedDatasources"; + final String relsPath = workingDir + "/relatedDatasources"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java index e988366c8..9498c0f33 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java @@ -44,10 +44,10 @@ public class PrepareRelatedProjectsJob { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String relsPath = workingPath + "/relatedProjects"; + final String relsPath = workingDir + "/relatedProjects"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); @@ -64,7 +64,7 @@ public class PrepareRelatedProjectsJob { 
.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class)); final Dataset rels = ClusterUtils - .readPath(spark, graphPath + "/relation", Relation.class) + .loadRelations(graphPath, spark) .filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)) .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java index 724acc4dc..8270500fd 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedPublicationsJob.java @@ -43,10 +43,10 @@ public class PrepareRelatedPublicationsJob { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String relsPath = workingPath + "/relatedPublications"; + final String relsPath = workingDir + "/relatedPublications"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); @@ -65,7 +65,7 @@ public class PrepareRelatedPublicationsJob { Encoders.bean(OaBrokerRelatedPublication.class)); final Dataset rels = ClusterUtils - .readPath(spark, graphPath + "/relation", Relation.class) + .loadRelations(graphPath, spark) .filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) .filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass())) @@ -75,7 +75,8 @@ public class PrepareRelatedPublicationsJob { final Dataset dataset = rels .joinWith(pubs, 
pubs.col("openaireId").equalTo(rels.col("target")), "inner") .map(t -> { - final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2); + final RelatedPublication rel = new RelatedPublication( + t._1.getSource(), t._2); rel.getRelPublication().setRelType(t._1.getRelClass()); return rel; }, Encoders.bean(RelatedPublication.class)); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java index d15565d0d..16b450733 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedSoftwaresJob.java @@ -44,10 +44,10 @@ public class PrepareRelatedSoftwaresJob { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String relsPath = workingPath + "/relatedSoftwares"; + final String relsPath = workingDir + "/relatedSoftwares"; log.info("relsPath: {}", relsPath); final SparkConf conf = new SparkConf(); @@ -64,7 +64,7 @@ public class PrepareRelatedSoftwaresJob { .map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class)); final Dataset rels = ClusterUtils - .readPath(spark, graphPath + "/relation", Relation.class) + .loadRelations(graphPath, spark) .filter(r -> r.getDataInfo().getDeletedbyinference()) .filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT)) .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java index d3c7113ec..cf4450603 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareSimpleEntititiesJob.java @@ -44,10 +44,10 @@ public class PrepareSimpleEntititiesJob { final String graphPath = parser.get("graphPath"); log.info("graphPath: {}", graphPath); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); - final String simpleEntitiesPath = workingPath + "/simpleEntities"; + final String simpleEntitiesPath = workingDir + "/simpleEntities"; log.info("simpleEntitiesPath: {}", simpleEntitiesPath); final SparkConf conf = new SparkConf(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java index 26ebbb7c0..cb3ea5464 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java @@ -16,7 +16,24 @@ public class EnrichMissingSubject extends UpdateMatcher { public EnrichMissingSubject() { super(20, - s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()), + s -> { + switch (s.getType().toLowerCase()) { + case "acm": + return Topic.ENRICH_MISSING_SUBJECT_ACM; + case "arxiv": + return Topic.ENRICH_MISSING_SUBJECT_ARXIV; + case "ddc": + return Topic.ENRICH_MISSING_SUBJECT_DDC; + case "jel": + return Topic.ENRICH_MISSING_SUBJECT_JEL; + case "mesh": + return Topic.ENRICH_MISSING_SUBJECT_MESHEUROPMC; + case "rvk": + 
return Topic.ENRICH_MISSING_SUBJECT_RVK; + default: + return null; + } + }, (p, s) -> p.getSubjects().add(s), s -> subjectAsString(s)); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index bbe6609d7..1f6edf96e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -16,7 +16,24 @@ public class EnrichMoreSubject extends UpdateMatcher { public EnrichMoreSubject() { super(20, - s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()), + s -> { + switch (s.getType().toLowerCase()) { + case "acm": + return Topic.ENRICH_MORE_SUBJECT_ACM; + case "arxiv": + return Topic.ENRICH_MORE_SUBJECT_ARXIV; + case "ddc": + return Topic.ENRICH_MORE_SUBJECT_DDC; + case "jel": + return Topic.ENRICH_MORE_SUBJECT_JEL; + case "mesh": + return Topic.ENRICH_MORE_SUBJECT_MESHEUROPMC; + case "rvk": + return Topic.ENRICH_MORE_SUBJECT_RVK; + default: + return null; + } + }, (p, s) -> p.getSubjects().add(s), s -> subjectAsString(s)); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java index d8b8dd807..9ce64f6bd 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.oaf.Relation; public class ClusterUtils { @@ -30,6 +31,16 @@ 
public class ClusterUtils { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } + public static Dataset loadRelations(final String graphPath, final SparkSession spark) { + return ClusterUtils + .readPath(spark, graphPath + "/relation", Relation.class) + .map(r -> { + r.setSource(ConversionUtils.cleanOpenaireId(r.getSource())); + r.setTarget(ConversionUtils.cleanOpenaireId(r.getTarget())); + return r; + }, Encoders.bean(Relation.class)); + } + public static Dataset readPath( final SparkSession spark, final String inputPath, @@ -67,6 +78,7 @@ public class ClusterUtils { .map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz)) .write() .mode(SaveMode.Overwrite) + .option("compression", "gzip") .json(path); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index 053627a5f..ecbfd821e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -74,7 +74,7 @@ public class ConversionUtils { } final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset(); - res.setOpenaireId(d.getId()); + res.setOpenaireId(cleanOpenaireId(d.getId())); res.setOriginalId(first(d.getOriginalId())); res.setTitle(structPropValue(d.getTitle())); res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid)); @@ -89,7 +89,7 @@ public class ConversionUtils { } final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication(); - res.setOpenaireId(p.getId()); + res.setOpenaireId(cleanOpenaireId(p.getId())); res.setOriginalId(first(p.getOriginalId())); res.setTitle(structPropValue(p.getTitle())); res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid)); @@ -106,7 +106,7 @@ public class ConversionUtils { final 
OaBrokerMainEntity res = new OaBrokerMainEntity(); - res.setOpenaireId(result.getId()); + res.setOpenaireId(cleanOpenaireId(result.getId())); res.setOriginalId(first(result.getOriginalId())); res.setTypology(classId(result.getResulttype())); res.setTitles(structPropList(result.getTitle())); @@ -129,6 +129,10 @@ public class ConversionUtils { return res; } + public static String cleanOpenaireId(final String id) { + return id.contains("|") ? StringUtils.substringAfter(id, "|") : id; + } + private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) { if (author == null) { return null; @@ -188,7 +192,7 @@ public class ConversionUtils { } final OaBrokerProject res = new OaBrokerProject(); - res.setOpenaireId(p.getId()); + res.setOpenaireId(cleanOpenaireId(p.getId())); res.setTitle(fieldValue(p.getTitle())); res.setAcronym(fieldValue(p.getAcronym())); res.setCode(fieldValue(p.getCode())); @@ -214,7 +218,7 @@ public class ConversionUtils { } final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware(); - res.setOpenaireId(sw.getId()); + res.setOpenaireId(cleanOpenaireId(sw.getId())); res.setName(structPropValue(sw.getTitle())); res.setDescription(fieldValue(sw.getDescription())); res.setRepository(fieldValue(sw.getCodeRepositoryUrl())); @@ -230,7 +234,7 @@ public class ConversionUtils { final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource(); res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname()))); - res.setOpenaireId(ds.getId()); + res.setOpenaireId(cleanOpenaireId(ds.getId())); res.setType(classId(ds.getDatasourcetype())); return res; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java index 75c4625ce..c693be93c 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/DatasourceRelationsAccumulator.java @@ -59,9 +59,18 @@ public class DatasourceRelationsAccumulator implements Serializable { final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator(); collectedFromSet .stream() - .map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL)) + .map( + s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s), + BrokerConstants.COLLECTED_FROM_REL)) .forEach(res::addTuple); - hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple); + + hostedBySet + .stream() + .map( + s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s), + BrokerConstants.HOSTED_BY_REL)) + .forEach(res::addTuple); + return res; } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java index 1ab56cc34..103751f95 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventFinder.java @@ -76,6 +76,7 @@ public class EventFinder { final Set dsIdWhitelist, final Set dsIdBlacklist, final Set dsTypeWhitelist, + final Set topicWhitelist, final Map accumulators) { final List> list = new ArrayList<>(); @@ -84,7 +85,13 @@ public class EventFinder { for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) { if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) { for (final UpdateMatcher matcher : matchers) { - list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)); + 
for (final UpdateInfo info : matcher + .searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) { + if (topicWhitelist == null || topicWhitelist.isEmpty() + || topicWhitelist.contains(info.getTopic().getPath())) { + list.add(info); + } + } } } } diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/check_duplicates.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/check_duplicates.json new file mode 100644 index 000000000..2584b78fc --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/check_duplicates.json @@ -0,0 +1,9 @@ +[ + + { + "paramName": "o", + "paramLongName": "outputDir", + "paramDescription": "the path where the data are stored", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json index adee1888a..0d942cd59 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/common_params.json @@ -7,7 +7,7 @@ }, { "paramName": "o", - "paramLongName": "workingPath", + "paramLongName": "workingDir", "paramDescription": "the path where the temporary data will be stored", "paramRequired": true } diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml index 14e33b091..ea80c3acf 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_all/oozie_app/workflow.xml @@ -6,7 +6,7 @@ the path where the 
graph is stored - workingPath + outputDir the path where the the generated data will be stored @@ -24,6 +24,11 @@ - a black list (comma separeted, - for empty list) of datasource ids + + topicWhitelist + * + a white list (comma separeted, * for all) of topics + esEventIndexName the elasticsearch index name for events @@ -36,6 +41,26 @@ esIndexHost the elasticsearch host + + esBatchWriteRetryCount + 8 + an ES configuration property + + + esBatchWriteRetryWait + 60s + an ES configuration property + + + esBatchSizeEntries + 200 + an ES configuration property + + + esNodesWanOnly + true + an ES configuration property + maxIndexedEventsForDsAndTopic the max number of events for each couple (ds/topic) @@ -111,15 +136,25 @@ - + + + + + ${wf:conf('resumeFrom') eq 'ensure_output_dir'} + ${wf:conf('resumeFrom') eq 'index_event_subset'} + ${wf:conf('resumeFrom') eq 'stats'} + ${wf:conf('resumeFrom') eq 'index_notifications'} + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + - + @@ -152,7 +187,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -176,7 +211,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -201,7 +236,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -225,7 +260,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -249,7 +284,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -273,7 +308,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -299,7 +334,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - 
--workingPath${workingPath} + --workingDir${workingDir} @@ -323,7 +358,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -347,7 +382,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -371,7 +406,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -395,7 +430,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -419,7 +454,7 @@ --conf spark.sql.shuffle.partitions=3840 --graphPath${graphInputPath} - --workingPath${workingPath} + --workingDir${workingDir} @@ -442,10 +477,12 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --workingPath${workingPath} + --workingDir${workingDir} + --outputDir${outputDir} --datasourceIdWhitelist${datasourceIdWhitelist} --datasourceTypeWhitelist${datasourceTypeWhitelist} --datasourceIdBlacklist${datasourceIdBlacklist} + --topicWhitelist${topicWhitelist} @@ -468,38 +505,16 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --workingPath${workingPath} + --outputDir${outputDir} --index${esEventIndexName} --esHost${esIndexHost} + --esBatchWriteRetryCount${esBatchWriteRetryCount} + --esBatchWriteRetryWait${esBatchWriteRetryWait} + --esBatchSizeEntries${esBatchSizeEntries} + --esNodesWanOnly${esNodesWanOnly} --maxEventsForTopic${maxIndexedEventsForDsAndTopic} --brokerApiBaseUrl${brokerApiBaseUrl} - - - - - - - yarn - cluster - IndexNotificationsOnESJob - eu.dnetlib.dhp.broker.oa.IndexNotificationsJob - dhp-broker-events-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.dynamicAllocation.maxExecutors="8" - --conf 
spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=3840 - - --workingPath${workingPath} - --index${esNotificationsIndexName} - --esHost${esIndexHost} - --brokerApiBaseUrl${brokerApiBaseUrl} - @@ -521,16 +536,46 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --workingPath${workingPath} + --outputDir${outputDir} --dbUrl${brokerDbUrl} --dbUser${brokerDbUser} --dbPassword${brokerDbPassword} --brokerApiBaseUrl${brokerApiBaseUrl} + + + + + + + yarn + cluster + IndexNotificationsOnESJob + eu.dnetlib.dhp.broker.oa.IndexNotificationsJob + dhp-broker-events-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.dynamicAllocation.maxExecutors="8" + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --outputDir${outputDir} + --index${esNotificationsIndexName} + --esHost${esIndexHost} + --esBatchWriteRetryCount${esBatchWriteRetryCount} + --esBatchWriteRetryWait${esBatchWriteRetryWait} + --esBatchSizeEntries${esBatchSizeEntries} + --esNodesWanOnly${esNodesWanOnly} + --brokerApiBaseUrl${brokerApiBaseUrl} + - + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json index bab808193..e803bb5b9 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/generate_events.json @@ -1,7 +1,13 @@ [ + { + "paramName": "wp", + "paramLongName": "workingDir", + "paramDescription": "the path where the temporary data are stored", + "paramRequired": true + }, { "paramName": "o", - "paramLongName": "workingPath", + "paramLongName": "outputDir", "paramDescription": "the path where the generated events will be stored", "paramRequired": true }, @@ -22,5 +28,11 @@ "paramLongName": "datasourceIdBlacklist", "paramDescription": "a black list (comma separeted, - for empty list) of datasource ids", "paramRequired": true + }, + { + "paramName": "topicWhitelist", + "paramLongName": "topicWhitelist", + "paramDescription": "a white list (comma separeted, * for all) of topics", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json index ac1dbf786..f7e072d0f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_es.json @@ -1,8 +1,8 @@ [ { "paramName": "o", - "paramLongName": "workingPath", - "paramDescription": "the workinh path", + "paramLongName": "outputDir", + "paramDescription": "the data path", "paramRequired": true }, { @@ -16,5 +16,29 @@ "paramLongName": "esHost", "paramDescription": "the ES host", "paramRequired": true + }, + { + "paramName": "esBatchWriteRetryCount", + "paramLongName": "esBatchWriteRetryCount", + "paramDescription": "an ES configuration property", + "paramRequired": true + }, + { + "paramName": "esBatchWriteRetryWait", + "paramLongName": "esBatchWriteRetryWait", + "paramDescription": "an ES configuration property", + "paramRequired": true 
+ }, + { + "paramName": "esBatchSizeEntries", + "paramLongName": "esBatchSizeEntries", + "paramDescription": "an ES configuration property", + "paramRequired": true + }, + { + "paramName": "esNodesWanOnly", + "paramLongName": "esNodesWanOnly", + "paramDescription": "an ES configuration property", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_event_subset.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_event_subset.json index 4921bc03e..0046490bb 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_event_subset.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_event_subset.json @@ -1,8 +1,8 @@ [ { "paramName": "o", - "paramLongName": "workingPath", - "paramDescription": "the workinh path", + "paramLongName": "outputDir", + "paramDescription": "the path where the generated data are stored", "paramRequired": true }, { @@ -16,7 +16,31 @@ "paramLongName": "esHost", "paramDescription": "the ES host", "paramRequired": true + }, + { + "paramName": "esBatchWriteRetryCount", + "paramLongName": "esBatchWriteRetryCount", + "paramDescription": "an ES configuration property", + "paramRequired": true }, + { + "paramName": "esBatchWriteRetryWait", + "paramLongName": "esBatchWriteRetryWait", + "paramDescription": "an ES configuration property", + "paramRequired": true + }, + { + "paramName": "esBatchSizeEntries", + "paramLongName": "esBatchSizeEntries", + "paramDescription": "an ES configuration property", + "paramRequired": true + }, + { + "paramName": "esNodesWanOnly", + "paramLongName": "esNodesWanOnly", + "paramDescription": "an ES configuration property", + "paramRequired": true + }, { "paramName": "n", "paramLongName": "maxEventsForTopic", diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_notifications.json 
b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_notifications.json index 5eea894c8..370b48411 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_notifications.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/index_notifications.json @@ -1,8 +1,8 @@ [ { "paramName": "o", - "paramLongName": "workingPath", - "paramDescription": "the workinh path", + "paramLongName": "outputDir", + "paramDescription": "the dir that contains the events folder", "paramRequired": true }, { @@ -17,6 +17,30 @@ "paramDescription": "the ES host", "paramRequired": true }, + { + "paramName": "esBatchWriteRetryCount", + "paramLongName": "esBatchWriteRetryCount", + "paramDescription": "an ES configuration property", + "paramRequired": true + }, + { + "paramName": "esBatchWriteRetryWait", + "paramLongName": "esBatchWriteRetryWait", + "paramDescription": "an ES configuration property", + "paramRequired": true + }, + { + "paramName": "esBatchSizeEntries", + "paramLongName": "esBatchSizeEntries", + "paramDescription": "an ES configuration property", + "paramRequired": true + }, + { + "paramName": "esNodesWanOnly", + "paramLongName": "esNodesWanOnly", + "paramDescription": "an ES configuration property", + "paramRequired": true + }, { "paramName": "broker", "paramLongName": "brokerApiBaseUrl", diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml index 879c0d349..248326d57 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml @@ -6,8 +6,8 @@ the path where the graph is stored - workingPath - the 
path where the the generated data will be stored + outputDir + the path where the the generated data are stored datasourceIdWhitelist @@ -36,6 +36,26 @@ esIndexHost the elasticsearch host + + esBatchWriteRetryCount + 8 + an ES configuration property + + + esBatchWriteRetryWait + 60s + an ES configuration property + + + esBatchSizeEntries + 200 + an ES configuration property + + + esNodesWanOnly + true + an ES configuration property + maxIndexedEventsForDsAndTopic the max number of events for each couple (ds/topic) @@ -122,9 +142,13 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --workingPath${workingPath} + --outputDir${outputDir} --index${esNotificationsIndexName} --esHost${esIndexHost} + --esBatchWriteRetryCount${esBatchWriteRetryCount} + --esBatchWriteRetryWait${esBatchWriteRetryWait} + --esBatchSizeEntries${esBatchSizeEntries} + --esNodesWanOnly${esNodesWanOnly} --brokerApiBaseUrl${brokerApiBaseUrl} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/od_partitions_params.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/od_partitions_params.json index 10ba926ab..12cd6a391 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/od_partitions_params.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/od_partitions_params.json @@ -1,8 +1,8 @@ [ { "paramName": "o", - "paramLongName": "workingPath", - "paramDescription": "the path where the temporary data will be stored", + "paramLongName": "outputDir", + "paramDescription": "the path where the data will be stored", "paramRequired": true }, { diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/workflow.xml index dba3c9f73..7c5b722d2 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/opendoarPartition/oozie_app/workflow.xml @@ -6,7 +6,7 @@ the opendoar IDs whitelist (comma separated) - workingPath + outputDir the path where the the generated data will be stored @@ -87,7 +87,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.shuffle.partitions=3840 - --workingPath${workingPath} + --outputDir${outputDir} --opendoarIds${opendoarIds} diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/config-default.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml new file mode 100644 index 000000000..9095004ad --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/reindex/oozie_app/workflow.xml @@ -0,0 +1,140 @@ + + + + + outputDir + the path where the the generated data will be stored + + + esEventIndexName + the elasticsearch index name for events + + + esIndexHost + the elasticsearch host + + + esBatchWriteRetryCount + 8 + an ES configuration property + + + esBatchWriteRetryWait + 60s + an ES configuration property + + + esBatchSizeEntries + 200 + an ES configuration property + 
+ + esNodesWanOnly + true + an ES configuration property + + + maxIndexedEventsForDsAndTopic + the max number of events for each couple (ds/topic) + + + brokerApiBaseUrl + the url of the broker service api + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + IndexEventSubsetOnESJob + eu.dnetlib.dhp.broker.oa.IndexEventSubsetJob + dhp-broker-events-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.dynamicAllocation.maxExecutors="8" + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --outputDir${outputDir} + --index${esEventIndexName} + --esHost${esIndexHost} + --esBatchWriteRetryCount${esBatchWriteRetryCount} + --esBatchWriteRetryWait${esBatchWriteRetryWait} + --esBatchSizeEntries${esBatchSizeEntries} + 
--esNodesWanOnly${esNodesWanOnly} + --maxEventsForTopic${maxIndexedEventsForDsAndTopic} + --brokerApiBaseUrl${brokerApiBaseUrl} + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats/oozie_app/config-default.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats/oozie_app/workflow.xml new file mode 100644 index 000000000..218af4515 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats/oozie_app/workflow.xml @@ -0,0 +1,114 @@ + + + + + outputDir + the path where the the generated data will be stored + + + brokerApiBaseUrl + the url of the broker service api + + + brokerDbUrl + the url of the broker database + + + brokerDbUser + the user of the broker database + + + brokerDbPassword + the password of the broker database + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution 
listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + GenerateStatsJob + eu.dnetlib.dhp.broker.oa.GenerateStatsJob + dhp-broker-events-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --outputDir${outputDir} + --dbUrl${brokerDbUrl} + --dbUser${brokerDbUser} + --dbPassword${brokerDbPassword} + --brokerApiBaseUrl${brokerApiBaseUrl} + + + + + + + + diff --git a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats_params.json b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats_params.json index 15d7d251f..2388b1c1f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats_params.json +++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/stats_params.json @@ -1,8 +1,8 @@ [ { - "paramName": "wp", - "paramLongName": "workingPath", - "paramDescription": "the working path", + "paramName": "o", + "paramLongName": "outputDir", + "paramDescription": "the path where generated data are stored", "paramRequired": true }, { diff --git 
a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java index 8646ac742..9bc90d51d 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java @@ -7,7 +7,6 @@ import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.PairFunction; @@ -16,8 +15,8 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; -import org.codehaus.jackson.map.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.hash.Hashing; import eu.dnetlib.dedup.graph.ConnectedComponent; diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java index 572824e3d..7adf992cd 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java @@ -10,7 +10,8 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; -import org.codehaus.jackson.map.ObjectMapper; + +import com.fasterxml.jackson.databind.ObjectMapper; import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Oaf; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 9c9221b27..683986de2 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils -import org.codehaus.jackson.map.ObjectMapper +import com.fasterxml.jackson.databind.ObjectMapper import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ -import scala.io.Source case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} @@ -19,23 +18,18 @@ case class HostedByItemType(id: String, officialname: String, issn: String, eiss case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){} object DoiBoostMappingUtil { - def getUnknownCountry(): Qualifier = { - createQualifier("UNKNOWN","UNKNOWN","dnet:countries","dnet:countries") - } - - def generateMAGAffiliationId(affId: String): String = { s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}" } - val logger: Logger = LoggerFactory.getLogger(getClass) //STATIC STRING val MAG = "microsoft" val MAG_NAME = "Microsoft Academic Graph" - val ORCID = "ORCID" + val ORCID = "orcid" + val ORCID_PENDING = "orcid_pending" val CROSSREF = "Crossref" val 
UNPAYWALL = "UnpayWall" val GRID_AC = "grid.ac" diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala index 7a6cd3faa..78477ae4d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala @@ -39,33 +39,38 @@ object SparkGenerateDOIBoostActionSet { val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath") val dbOrganizationPath = parser.get("dbOrganizationPath") val workingDirPath = parser.get("targetPath") + val sequenceFilePath = parser.get("sFilePath") - spark.read.load(dbDatasetPath).as[OafDataset] + val asDataset = spark.read.load(dbDatasetPath).as[OafDataset] .map(d =>DoiBoostMappingUtil.fixResult(d)) .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet") +// .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet") - spark.read.load(dbPublicationPath).as[Publication] + val asPublication =spark.read.load(dbPublicationPath).as[Publication] .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") +// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") - spark.read.load(dbOrganizationPath).as[Organization] + val asOrganization = spark.read.load(dbOrganizationPath).as[Organization] .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") +// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") - spark.read.load(crossRefRelation).as[Relation] + val asCRelation = 
spark.read.load(crossRefRelation).as[Relation] .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") +// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") - spark.read.load(dbaffiliationRelationPath).as[Relation] + val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation] .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) - .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") +// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") - val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)] - d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) + + val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation) + +// spark.read.load(s"$workingDirPath/actionSet").as[(String,String)] + + d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala index 860254527..11f9828db 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala @@ -2,6 +2,7 @@ package eu.dnetlib.doiboost import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.merge.AuthorMerger +import eu.dnetlib.dhp.schema.common.ModelConstants import 
eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset} import eu.dnetlib.doiboost.mag.ConversionUtil import org.apache.commons.io.IOUtils @@ -30,7 +31,7 @@ object SparkGenerateDoiBoost { import spark.implicits._ val hostedByMapPath = parser.get("hostedByMapPath") - val workingDirPath = parser.get("workingDirPath") + val workingDirPath = parser.get("workingPath") implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] @@ -132,7 +133,7 @@ object SparkGenerateDoiBoost { o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get)) if (affiliation.OfficialPage.isDefined) o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get)) - o.setCountry(DoiBoostMappingUtil.getUnknownCountry()) + o.setCountry(ModelConstants.UNKNOWN_COUNTRY) o } else diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index d87bc79fc..1e52e93c1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -206,7 +206,7 @@ case object Crossref2Oaf { a.setSurname(family) a.setFullname(s"$given $family") if (StringUtils.isNotBlank(orcid)) - a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateDataInfo())).asJava) + a.setPid(List(createSP(orcid, ORCID_PENDING, PID_TYPES, generateDataInfo())).asJava) a } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java index f69a05da1..cda4983b7 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java @@ -2,18 +2,16 @@ 
package eu.dnetlib.doiboost.crossref; import java.io.ByteArrayOutputStream; +import java.util.Optional; import java.util.zip.Inflater; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; @@ -30,34 +28,45 @@ public class CrossrefImporter { parser.parseArgument(args); - final String hdfsuri = parser.get("namenode"); - System.out.println("HDFS URI" + hdfsuri); - Path hdfswritepath = new Path(parser.get("targetPath")); - System.out.println("TargetPath: " + hdfsuri); + final String namenode = parser.get("namenode"); + System.out.println("namenode: " + namenode); - final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp")) - ? 
Long.parseLong(parser.get("timestamp")) - : -1; + Path targetPath = new Path(parser.get("targetPath")); + System.out.println("targetPath: " + targetPath); - if (timestamp > 0) - System.out.println("Timestamp added " + timestamp); + final Long timestamp = Optional + .ofNullable(parser.get("timestamp")) + .map(s -> { + try { + return Long.parseLong(s); + } catch (NumberFormatException e) { + return -1L; + } + }) + .orElse(-1L); + System.out.println("timestamp: " + timestamp); + + final String esServer = parser.get("esServer"); + System.out.println("esServer: " + esServer); + + final String esIndex = parser.get("esIndex"); + System.out.println("esIndex: " + esIndex); // ====== Init HDFS File System Object Configuration conf = new Configuration(); // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); + conf.set("fs.defaultFS", namenode); // Because of Maven conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - ESClient client = timestamp > 0 - ? 
new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp) - : new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref"); + // "ip-90-147-167-25.ct1.garrservices.it", "crossref" + final ESClient client = new ESClient(esServer, esIndex, timestamp); try (SequenceFile.Writer writer = SequenceFile .createWriter( conf, - SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.file(targetPath), SequenceFile.Writer.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class))) { @@ -74,8 +83,7 @@ public class CrossrefImporter { end = System.currentTimeMillis(); final float time = (end - start) / 1000.0F; System.out - .println( - String.format("Imported %d records last 100000 imported in %f seconds", i, time)); + .println(String.format("Imported %s records last 100000 imported in %s seconds", i, time)); start = System.currentTimeMillis(); } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java index e31ccf399..dcebbbcac 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java @@ -1,11 +1,11 @@ package eu.dnetlib.doiboost.crossref; -import java.io.IOException; import java.util.Iterator; import java.util.List; import org.apache.commons.io.IOUtils; +import org.apache.http.HttpHeaders; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.StringEntity; @@ -17,13 +17,17 @@ import org.slf4j.LoggerFactory; import com.jayway.jsonpath.JsonPath; public class ESClient implements Iterator { - private static final Logger logger = LoggerFactory.getLogger(ESClient.class); - static final String blobPath = "$.hits[*].hits[*]._source.blob"; - static final String scrollIdPath = "$._scroll_id"; - static final 
String JSON_NO_TS = "{\"size\":1000}"; - static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}"; - static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}"; + private static final String BLOB_PATH = "$.hits.hits[*]._source.blob"; + private static final String SCROLL_ID_PATH = "$._scroll_id"; + private static final String JSON_NO_TS = "{\"size\":1000}"; + private static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}"; + private static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}"; + + public static final String APPLICATION_JSON = "application/json"; + + public static final String ES_SEARCH_URL = "http://%s:9200/%s/_search?scroll=1m"; + public static final String ES_SCROLL_URL = "http://%s:9200/_search/scroll"; private final String scrollId; @@ -31,47 +35,30 @@ public class ESClient implements Iterator { private final String esHost; - public ESClient(final String esHost, final String esIndex) throws IOException { - + public ESClient(final String esHost, final String esIndex, final long timestamp) { this.esHost = esHost; - final String body = getResponse( - String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS); - scrollId = getJPathString(scrollIdPath, body); - buffer = getBlobs(body); - } - public ESClient(final String esHost, final String esIndex, final long timestamp) - throws IOException { - this.esHost = esHost; - final String body = getResponse( - String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), - String.format(JSON_WITH_TS, timestamp)); - scrollId = getJPathString(scrollIdPath, body); + final String body = timestamp > 0 + ? 
getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), String.format(JSON_WITH_TS, timestamp)) + : getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), JSON_NO_TS); + scrollId = getJPathString(SCROLL_ID_PATH, body); buffer = getBlobs(body); } private String getResponse(final String url, final String json) { - CloseableHttpClient client = HttpClients.createDefault(); - try { - + try (CloseableHttpClient client = HttpClients.createDefault()) { HttpPost httpPost = new HttpPost(url); if (json != null) { StringEntity entity = new StringEntity(json); httpPost.setEntity(entity); - httpPost.setHeader("Accept", "application/json"); - httpPost.setHeader("Content-type", "application/json"); + httpPost.setHeader(HttpHeaders.ACCEPT, APPLICATION_JSON); + httpPost.setHeader(HttpHeaders.CONTENT_TYPE, APPLICATION_JSON); + } + try (CloseableHttpResponse response = client.execute(httpPost)) { + return IOUtils.toString(response.getEntity().getContent()); } - CloseableHttpResponse response = client.execute(httpPost); - - return IOUtils.toString(response.getEntity().getContent()); } catch (Throwable e) { throw new RuntimeException("Error on executing request ", e); - } finally { - try { - client.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close client ", e); - } } } @@ -87,7 +74,7 @@ public class ESClient implements Iterator { } private List getBlobs(final String body) { - final List res = JsonPath.read(body, "$.hits.hits[*]._source.blob"); + final List res = JsonPath.read(body, BLOB_PATH); return res; } @@ -102,11 +89,11 @@ public class ESClient implements Iterator { if (buffer.isEmpty()) { final String json_param = String.format(JSON_SCROLL, scrollId); - final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); + final String body = getResponse(String.format(ES_SCROLL_URL, esHost), json_param); try { buffer = getBlobs(body); } catch (Throwable e) { - logger.error("Error on get next page: body:" + 
body); + System.out.println("Error on get next page: body:" + body); } } return nextItem; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala similarity index 99% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index 02dc4979a..780e65c1e 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkPreProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -11,7 +11,7 @@ import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ -object SparkPreProcessMAG { +object SparkProcessMAG { def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 1d669f4b5..ccf005ce1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -1,11 +1,11 @@ package eu.dnetlib.doiboost.orcid +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication} import eu.dnetlib.dhp.schema.orcid.OrcidDOI import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier} import org.apache.commons.lang.StringUtils -import org.codehaus.jackson.map.ObjectMapper import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ @@ -18,7 +18,7 @@ case class 
ORCIDItem(oid:String,name:String,surname:String,creditName:String,err case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {} object ORCIDToOAF { val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) - val mapper = new ObjectMapper + val mapper = new ObjectMapper() def isJsonValid(inputStr: String): Boolean = { import java.io.IOException diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java index 598835a00..1422a0840 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkDownloadOrcidAuthors.java @@ -3,10 +3,8 @@ package eu.dnetlib.doiboost.orcid; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.List; import java.util.Optional; import org.apache.commons.io.IOUtils; @@ -18,11 +16,9 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.util.LongAccumulator; -import org.mortbay.log.Log; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,7 +32,7 @@ public class SparkDownloadOrcidAuthors { static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss"; static final String lastUpdate = "2020-09-29 00:00:00"; - public static void main(String[] args) throws IOException, Exception { + public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -51,12 
+47,12 @@ public class SparkDownloadOrcidAuthors { .orElse(Boolean.TRUE); logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); final String workingPath = parser.get("workingPath"); - logger.info("workingPath: ", workingPath); + logger.info("workingPath: {}", workingPath); final String outputPath = parser.get("outputPath"); - logger.info("outputPath: ", outputPath); + logger.info("outputPath: {}", outputPath); final String token = parser.get("token"); final String lambdaFileName = parser.get("lambdaFileName"); - logger.info("lambdaFileName: ", lambdaFileName); + logger.info("lambdaFileName: {}", lambdaFileName); SparkConf conf = new SparkConf(); runWithSparkSession( @@ -171,8 +167,8 @@ public class SparkDownloadOrcidAuthors { } private static boolean isModified(String orcidId, String modifiedDate) { - Date modifiedDateDt = null; - Date lastUpdateDt = null; + Date modifiedDateDt; + Date lastUpdateDt; try { if (modifiedDate.length() != 19) { modifiedDate = modifiedDate.substring(0, 19); diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json index 6eb1ec6f1..93d24a891 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json @@ -5,5 +5,6 @@ {"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true}, {"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true}, {"paramName": "do", "paramLongName":"dbOrganizationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true}, - {"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", 
"paramRequired": true} + {"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true}, + {"paramName": "sp", "paramLongName":"sFilePath", "paramDescription": "the Sequence file Path", "paramRequired": true} ] diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json index ea08f47d4..1ff63dd0e 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json @@ -3,5 +3,5 @@ {"paramName": "hb", "paramLongName":"hostedByMapPath", "paramDescription": "the hosted By Map Path", "paramRequired": true}, {"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affliation Path", "paramRequired": true}, {"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true}, - {"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true} + {"paramName": "w", "paramLongName":"workingPath", "paramDescription": "the Working Path", "paramRequired": true} ] diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_from_es.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_from_es.json index 87a138d52..0920d516a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_from_es.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/import_from_es.json @@ -1,5 +1,7 @@ [ - {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the sequencial file to write", "paramRequired": true}, - {"paramName":"n", "paramLongName":"namenode", "paramDescription": 
"the hive metastore uris", "paramRequired": true}, - {"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false} + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the sequencial file to write", "paramRequired": true}, + {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the hive metastore uris", "paramRequired": true}, + {"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false}, + {"paramName":"ess", "paramLongName":"esServer", "paramDescription": "elasticsearch server url", "paramRequired": true}, + {"paramName":"esi", "paramLongName":"esIndex", "paramDescription": "elasticsearch index name", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml similarity index 68% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml index a1755f329..508202e30 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml @@ -15,6 +15,10 @@ oozie.action.sharelib.for.spark spark2 + + oozie.launcher.mapreduce.user.classpath.first + true + hive_metastore_uris thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 @@ -23,36 +27,16 @@ spark2YarnHistoryServerAddress http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - - 
- oozie.launcher.mapreduce.user.classpath.first - true - - - sparkExecutorNumber - 4 - spark2EventLogDir /user/spark/spark2ApplicationHistory - sparkDriverMemory - 15G + spark2ExtraListeners + "com.cloudera.spark.lineage.NavigatorAppListener" - sparkExecutorMemory - 6G - - - sparkExecutorCores - 1 + spark2SqlQueryExecutionListeners + "com.cloudera.spark.lineage.NavigatorQueryListener" \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml new file mode 100644 index 000000000..3f5805b62 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml @@ -0,0 +1,335 @@ + + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorIntersectionMemory + memory for individual executor + + + + sparkExecutorCores + number of cores used by single executor + + + + + + workingPath + the working Path + + + + hostedByMapPath + the hostedByMap Path + + + outputPath + the Path of the sequence file action set + + + + + + inputPathCrossref + the Crossref input path + + + crossrefTimestamp + Timestamp for the Crossref incremental Harvesting + + + esServer + elasticsearch server url for the Crossref Harvesting + + + esIndex + elasticsearch index name for the Crossref Harvesting + + + + + inputPathMAG + the MAG working path + + + + + + inputPathUnpayWall + the UnpayWall working path + + + + + inputPathOrcid + the ORCID working path + + + + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + + ${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'} + ${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'} + ${wf:conf('resumeFrom') eq 'PreprocessMag'} + ${wf:conf('resumeFrom') eq 'PreprocessUW'} + ${wf:conf('resumeFrom') eq 'PreprocessORCID'} + 
${wf:conf('resumeFrom') eq 'CreateDOIBoost'} + ${wf:conf('resumeFrom') eq 'GenerateActionSet'} + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + eu.dnetlib.doiboost.crossref.CrossrefImporter + --targetPath${inputPathCrossref}/index_update + --namenode${nameNode} + --esServer${esServer} + --esIndex${esIndex} + --timestamp${crossrefTimestamp} + + + + + + + + + + + yarn-cluster + cluster + GenerateCrossrefDataset + eu.dnetlib.doiboost.crossref.CrossrefDataset + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + ${sparkExtraOPT} + + --workingPath${inputPathCrossref} + --masteryarn-cluster + + + + + + + + + + + + + + + + + + yarn-cluster + cluster + ConvertCrossrefToOAF + eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + ${sparkExtraOPT} + + --sourcePath${inputPathCrossref}/crossref_ds + --targetPath${workingPath} + --masteryarn-cluster + + + + + + + + + + + + + + + + + + + + + yarn-cluster + cluster + Convert Mag to Dataset + eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + ${sparkExtraOPT} + + --sourcePath${inputPathMAG}/input + --targetPath${inputPathMAG}/dataset + --masteryarn-cluster + + + + + + + + yarn-cluster + cluster + Convert Mag to OAF Dataset + eu.dnetlib.doiboost.mag.SparkProcessMAG + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + 
${sparkExtraOPT} + + --sourcePath${inputPathMAG}/dataset + --workingPath${inputPathMAG}/process + --targetPath${workingPath} + --masteryarn-cluster + + + + + + + + + + yarn-cluster + cluster + Convert UnpayWall to Dataset + eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + ${sparkExtraOPT} + + --sourcePath${inputPathUnpayWall}/uw_extracted + --targetPath${workingPath} + --masteryarn-cluster + + + + + + + + + yarn-cluster + cluster + Convert ORCID to Dataset + eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + ${sparkExtraOPT} + + --sourcePath${inputPathOrcid} + --targetPath${workingPath} + --masteryarn-cluster + + + + + + + + + yarn-cluster + cluster + Create DOIBoost Infospace + eu.dnetlib.doiboost.SparkGenerateDoiBoost + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorIntersectionMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + ${sparkExtraOPT} + + --hostedByMapPath${hostedByMapPath} + --affiliationPath${inputPathMAG}/process/Affiliations + --paperAffiliationPath${inputPathMAG}/process/PaperAuthorAffiliations + --workingPath${workingPath} + --masteryarn-cluster + + + + + + + + + yarn-cluster + cluster + Generate DOIBoost ActionSet + eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + ${sparkExtraOPT} + + 
--dbPublicationPath${workingPath}/doiBoostPublicationFiltered + --dbDatasetPath${workingPath}/crossrefDataset + --crossRefRelation${workingPath}/crossrefRelation + --dbaffiliationRelationPath${workingPath}/doiBoostPublicationAffiliation + --dbOrganizationPath${workingPath}/doiBoostOrganization + --targetPath${workingPath}/actionDataSet + --sFilePath${outputPath} + --masteryarn-cluster + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala index 0222b393d..a1b3d06b7 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala @@ -1,9 +1,9 @@ package eu.dnetlib.doiboost.orcid +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass import org.apache.spark.sql.{Encoder, Encoders, SparkSession} -import org.codehaus.jackson.map.ObjectMapper import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test import org.slf4j.{Logger, LoggerFactory} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index dd8342980..b3ef3a112 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -104,7 +104,7 @@ public class PrepareResultOrcidAssociationStep1 { + " LATERAL VIEW EXPLODE (author) a AS MyT " + " LATERAL VIEW EXPLODE 
(MyT.pid) p AS MyP " + " WHERE lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID + "' or " - + " lower(MyP.qalifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp " + + " lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp " + " GROUP BY id) r_t " + " JOIN (" + " SELECT source, target " diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java index 66297e177..60ad43859 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java @@ -108,7 +108,7 @@ public class SparkResultToCommunityFromOrganizationJob { .stream() .map(con -> con.getId()) .collect(Collectors.toList()); - Result res = new Result(); + R res = (R) ret.getClass().newInstance(); res.setId(ret.getId()); List propagatedContexts = new ArrayList<>(); for (String cId : communitySet) { diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java index 0c613d1b4..5ac117693 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java @@ -130,7 +130,7 @@ public class SparkResultToCommunityThroughSemRelJob { }) .filter(Objects::nonNull) .collect(Collectors.toList()); - Result r = new 
Result(); + R r = (R) ret.getClass().newInstance(); r.setId(ret.getId()); r.setContext(contextList); ret.mergeFrom(r); diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java index a8e1ab841..7709e00a8 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java @@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest; import eu.dnetlib.dhp.schema.oaf.Dataset; public class ResultToCommunityJobTest { @@ -66,7 +65,7 @@ public class ResultToCommunityJobTest { } @Test - public void test1() throws Exception { + public void testSparkResultToCommunityThroughSemRelJob() throws Exception { SparkResultToCommunityThroughSemRelJob .main( new String[] { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index b6210013c..3adbd244c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -23,7 +23,15 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT; import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE; import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM; -import static 
eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier; +import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty; import java.io.Closeable; import java.io.IOException; @@ -462,44 +470,48 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i return Arrays.asList(r); } else { + final String validationDate = rs.getString("curation_date"); + final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false); final String targetId = createOpenaireId(rs.getString(TARGET_TYPE), rs.getString("target_id"), false); final Relation r1 = new Relation(); final Relation r2 = new Relation(); - if (rs.getString(SOURCE_TYPE).equals("project")) { - r1.setCollectedfrom(collectedFrom); - r1.setRelType(RESULT_PROJECT); - r1.setSubRelType(OUTCOME); - r1.setRelClass(PRODUCES); - - r2.setCollectedfrom(collectedFrom); - r2.setRelType(RESULT_PROJECT); - r2.setSubRelType(OUTCOME); - r2.setRelClass(IS_PRODUCED_BY); - } else { - r1.setCollectedfrom(collectedFrom); - r1.setRelType(RESULT_RESULT); - r1.setSubRelType(RELATIONSHIP); - r1.setRelClass(IS_RELATED_TO); - - r2.setCollectedfrom(collectedFrom); - r2.setRelType(RESULT_RESULT); - r2.setSubRelType(RELATIONSHIP); - r2.setRelClass(IS_RELATED_TO); - } - + r1.setValidated(true); + r1.setValidationDate(validationDate); + r1.setCollectedfrom(collectedFrom); r1.setSource(sourceId); r1.setTarget(targetId); r1.setDataInfo(info); 
r1.setLastupdatetimestamp(lastUpdateTimestamp); + r2.setValidationDate(validationDate); + r2.setValidated(true); + r2.setCollectedfrom(collectedFrom); r2.setSource(targetId); r2.setTarget(sourceId); r2.setDataInfo(info); r2.setLastupdatetimestamp(lastUpdateTimestamp); + if (rs.getString(SOURCE_TYPE).equals("project")) { + r1.setRelType(RESULT_PROJECT); + r1.setSubRelType(OUTCOME); + r1.setRelClass(PRODUCES); + + r2.setRelType(RESULT_PROJECT); + r2.setSubRelType(OUTCOME); + r2.setRelClass(IS_PRODUCED_BY); + } else { + r1.setRelType(RESULT_RESULT); + r1.setSubRelType(RELATIONSHIP); + r1.setRelClass(IS_RELATED_TO); + + r2.setRelType(RESULT_RESULT); + r2.setSubRelType(RELATIONSHIP); + r2.setRelClass(IS_RELATED_TO); + } + return Arrays.asList(r1, r2); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 88a29fdd7..cddd00ad7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -36,31 +36,34 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final Node n = (Node) o; final Author author = new Author(); final String fullname = n.valueOf("./datacite:creatorName"); - author.setFullname(fullname); - - final PacePerson pp = new PacePerson(fullname, false); final String name = n.valueOf("./datacite:givenName"); - if (StringUtils.isBlank(name) & pp.isAccurate()) { - author.setName(pp.getNormalisedFirstName()); - } else { - author.setName(name); - } - final String surname = n.valueOf("./datacite:familyName"); - if (StringUtils.isBlank(surname) & pp.isAccurate()) { - author.setSurname(pp.getNormalisedSurname()); - } else { - author.setSurname(surname); - } + if (StringUtils.isNotBlank(fullname) || StringUtils.isNotBlank(name) || 
StringUtils.isNotBlank(surname)) { + author.setFullname(fullname); - if (StringUtils.isBlank(author.getFullname())) { - author.setFullname(String.format("%s, %s", author.getSurname(), author.getName())); - } + final PacePerson pp = new PacePerson(fullname, false); - author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); - author.setPid(preparePids(n, info)); - author.setRank(pos++); - res.add(author); + if (StringUtils.isBlank(name) & pp.isAccurate()) { + author.setName(pp.getNormalisedFirstName()); + } else { + author.setName(name); + } + + if (StringUtils.isBlank(surname) & pp.isAccurate()) { + author.setSurname(pp.getNormalisedSurname()); + } else { + author.setSurname(surname); + } + + if (StringUtils.isBlank(author.getFullname())) { + author.setFullname(String.format("%s, %s", author.getSurname(), author.getName())); + } + + author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); + author.setPid(preparePids(n, info)); + author.setRank(pos++); + res.add(author); + } } return res; } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java index f7579c0a0..a0ce4f5a6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java @@ -11,7 +11,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; -import org.codehaus.jackson.map.ObjectMapper; + +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.oaf.Oaf; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryClaims.sql 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryClaims.sql index 0390c11aa..f912d3ce9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryClaims.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryClaims.sql @@ -1 +1 @@ -SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE; \ No newline at end of file +SELECT source_type, source_id, target_type, target_id, semantics, curation_date::text FROM claim WHERE approved=TRUE; \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 5fc3cb5d0..baced2495 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -24,8 +24,14 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) @@ -65,7 +71,7 @@ public class MappersTest { assertValidId(p.getId()); - assertTrue(p.getOriginalId().size() == 2); + assertTrue(p.getOriginalId().size() == 1); 
assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0)); assertValidId(p.getCollectedfrom().get(0).getKey()); @@ -119,8 +125,26 @@ public class MappersTest { assertNotNull(p.getBestaccessright()); assertEquals("OPEN", p.getBestaccessright().getClassid()); - verifyRelations(p, r1, r2); - + assertValidId(r1.getSource()); + assertValidId(r1.getTarget()); + assertValidId(r2.getSource()); + assertValidId(r2.getTarget()); + assertValidId(r1.getCollectedfrom().get(0).getKey()); + assertValidId(r2.getCollectedfrom().get(0).getKey()); + assertNotNull(r1.getDataInfo()); + assertNotNull(r2.getDataInfo()); + assertNotNull(r1.getDataInfo().getTrust()); + assertNotNull(r2.getDataInfo().getTrust()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertTrue(StringUtils.isNotBlank(r1.getRelClass())); + assertTrue(StringUtils.isNotBlank(r2.getRelClass())); + assertTrue(StringUtils.isNotBlank(r1.getRelType())); + assertTrue(StringUtils.isNotBlank(r2.getRelType())); + assertTrue(r1.getValidated()); + assertTrue(r2.getValidated()); + assertEquals(r1.getValidationDate(), "2020-01-01"); + assertEquals(r2.getValidationDate(), "2020-01-01"); // System.out.println(new ObjectMapper().writeValueAsString(p)); // System.out.println(new ObjectMapper().writeValueAsString(r1)); // System.out.println(new ObjectMapper().writeValueAsString(r2)); @@ -158,7 +182,7 @@ public class MappersTest { final Relation r2 = (Relation) list.get(2); assertValidId(d.getId()); - assertTrue(d.getOriginalId().size() == 2); + assertTrue(d.getOriginalId().size() == 1); assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0)); assertValidId(d.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); @@ -211,19 +235,10 @@ public class MappersTest { }); assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid()); - verifyRelations(d, r1, r2); - } - - private void verifyRelations(OafEntity e, 
Relation r1, Relation r2) { - assertEquals(e.getId(), r1.getSource()); - assertEquals(e.getId(), r2.getTarget()); - assertValidId(r1.getSource()); assertValidId(r1.getTarget()); assertValidId(r2.getSource()); assertValidId(r2.getTarget()); - assertValidId(r1.getCollectedfrom().get(0).getKey()); - assertValidId(r2.getCollectedfrom().get(0).getKey()); assertNotNull(r1.getDataInfo()); assertNotNull(r2.getDataInfo()); assertNotNull(r1.getDataInfo().getTrust()); @@ -234,6 +249,10 @@ public class MappersTest { assertTrue(StringUtils.isNotBlank(r2.getRelClass())); assertTrue(StringUtils.isNotBlank(r1.getRelType())); assertTrue(StringUtils.isNotBlank(r2.getRelType())); + assertTrue(r1.getValidated()); + assertTrue(r2.getValidated()); + assertEquals(r1.getValidationDate(), "2020-01-01"); + assertEquals(r2.getValidationDate(), "2020-01-01"); } @Test @@ -343,6 +362,37 @@ public class MappersTest { assertValidId(p.getId()); assertValidId(p.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertEquals(1, p.getAuthor().size()); + assertEquals("OPEN", p.getBestaccessright().getClassid()); + assertTrue(StringUtils.isNotBlank(p.getPid().get(0).getValue())); + assertTrue(StringUtils.isNotBlank(p.getPid().get(0).getQualifier().getClassid())); + assertEquals("dataset", p.getResulttype().getClassname()); + assertEquals(1, p.getInstance().size()); + assertEquals("OPEN", p.getInstance().get(0).getAccessright().getClassid()); + assertValidId(p.getInstance().get(0).getCollectedfrom().getKey()); + assertValidId(p.getInstance().get(0).getHostedby().getKey()); + assertEquals( + "http://creativecommons.org/licenses/by/3.0/de/legalcode", p.getInstance().get(0).getLicense().getValue()); + assertEquals(1, p.getInstance().get(0).getUrl().size()); +// System.out.println(p.getInstance().get(0).getUrl().get(0)); +// System.out.println(p.getInstance().get(0).getHostedby().getValue()); + System.out.println(p.getPid().get(0).getValue()); + } + + 
@Test + void testBologna() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf-bologna.xml")); + final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + final Publication p = (Publication) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + System.out.println(p.getTitle().get(0).getValue()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); System.out.println(p.getTitle().get(0).getValue()); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 9cf75f208..0d1ec1ad1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -35,7 +35,7 @@ public class MigrateDbEntitiesApplicationTest { private MigrateDbEntitiesApplication app; - @Mock + @Mock(lenient = true) private ResultSet rs; @Mock diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt index 67c070d1d..ba47aaf5c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt @@ -1047,6 +1047,7 @@ dnet:pid_types @=@ dnet:pid_types @=@ urn @=@ urn dnet:pid_types @=@ dnet:pid_types @=@ who @=@ WHO Identifier dnet:pid_types @=@ dnet:pid_types @=@ drks @=@ DRKS Identifier dnet:pid_types @=@ 
dnet:pid_types @=@ handle @=@ Handle +dnet:pid_types @=@ dnet:pid_types @=@ data.europa.eu @=@ EU Persistent URL dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ACM @=@ An ACM classification term that can be associated to your publications dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ARXIV @=@ An ARXIV classification term that can be associated to your publications dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/DDC @=@ A Dewey Decimal classification term (DDC) that can be associated to your publications diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/eu_odp.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/eu_odp.xml new file mode 100644 index 000000000..c363d026c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/eu_odp.xml @@ -0,0 +1,368 @@ + + + + r3c4b2081b22::0007d64b38bb2b399120f9993f95d911 + 60a0a5b0-b63c-473f-b8bc-207ea037eb3b + 2021-01-28T17:24:33.095+01:00 + r3c4b2081b22 + 2021-02-03T16:57:03.099+01:00 + + + + http://data.europa.eu/88u/dataset/60a0a5b0-b63c-473f-b8bc-207ea037eb3b + + + GMIS - Favourable feeding habitat of adult Atlantic bluefin tuna (ABFT) Monthly 1998-2017 (frequency of occurence, %) + + JRC + 2019 + + 2019-07-09 + + Dataset + + The favourable feeding habitat of the Atlantic bluefin tuna is daily identified linking their ecological traits with environmental variables from satellite remote sensing and physical ocean models. The feeding habitat is mostly related to the occurrence of productive oceanic features (such as eddies) that are detected by satellite sensors of ocean colour (chlorophyll-a fronts). The physical variables used are sea surface temperature and sea surface height anomaly. 
More information: https://fishreg.jrc.ec.europa.eu/fish-habitat, Peer-reviewed publication: http://www.sciencedirect.com/science/article/pii/S0079661116000070 + + + protected area + environmental monitoring + oceanography + ocean + Environmental Monitoring Facilities + Environment + Protected Sites + Oceanographic Geographical Features + environmental data + http://publications.europa.eu/resource/authority/data-theme/TECH + http://publications.europa.eu/resource/authority/data-theme/ENVI + http://eurovoc.europa.eu/2114 + http://eurovoc.europa.eu/2107 + http://eurovoc.europa.eu/4801 + http://eurovoc.europa.eu/3140 + + + http://publications.europa.eu/resource/authority/file-type/OP_DATPRO + + + + http://publications.europa.eu/resource/authority/country/ESH + + + http://publications.europa.eu/resource/authority/country/ESP + + + http://publications.europa.eu/resource/authority/country/EST + + + http://publications.europa.eu/resource/authority/country/ETH + + + http://publications.europa.eu/resource/authority/country/DNK + + + http://publications.europa.eu/resource/authority/country/DZA + + + http://publications.europa.eu/resource/authority/country/EGY + + + http://publications.europa.eu/resource/authority/country/ERI + + + http://publications.europa.eu/resource/authority/country/FIN + + + http://publications.europa.eu/resource/authority/country/FRA + + + http://publications.europa.eu/resource/authority/country/1A0 + + + http://publications.europa.eu/resource/authority/country/AUT + + + http://publications.europa.eu/resource/authority/country/BGR + + + http://publications.europa.eu/resource/authority/country/MDA + + + http://publications.europa.eu/resource/authority/country/MCO + + + http://publications.europa.eu/resource/authority/country/LUX + + + http://publications.europa.eu/resource/authority/country/LTU + + + http://publications.europa.eu/resource/authority/country/MAR + + + http://publications.europa.eu/resource/authority/country/LVA + + + 
http://publications.europa.eu/resource/authority/country/LBN + + + http://publications.europa.eu/resource/authority/country/JOR + + + http://publications.europa.eu/resource/authority/country/LIE + + + http://publications.europa.eu/resource/authority/country/LBY + + + http://publications.europa.eu/resource/authority/country/TUN + + + http://publications.europa.eu/resource/authority/country/TUR + + + http://publications.europa.eu/resource/authority/country/SVK + + + http://publications.europa.eu/resource/authority/country/SVN + + + http://publications.europa.eu/resource/authority/country/SRB + + + http://publications.europa.eu/resource/authority/country/SSD + + + http://publications.europa.eu/resource/authority/country/TCD + + + http://publications.europa.eu/resource/authority/country/TGO + + + http://publications.europa.eu/resource/authority/country/SWE + + + http://publications.europa.eu/resource/authority/country/SYR + + + http://publications.europa.eu/resource/authority/country/AND + + + http://publications.europa.eu/resource/authority/country/BFA + + + http://publications.europa.eu/resource/authority/country/GRC + + + http://publications.europa.eu/resource/authority/country/GNB + + + http://publications.europa.eu/resource/authority/country/GGY + + + http://publications.europa.eu/resource/authority/country/GEO + + + http://publications.europa.eu/resource/authority/country/GBR + + + http://publications.europa.eu/resource/authority/country/FRO + + + http://publications.europa.eu/resource/authority/country/GMB + + + http://publications.europa.eu/resource/authority/country/GIN + + + http://publications.europa.eu/resource/authority/country/GIB + + + http://publications.europa.eu/resource/authority/country/GHA + + + http://publications.europa.eu/resource/authority/country/MKD + + + http://publications.europa.eu/resource/authority/country/MLI + + + http://publications.europa.eu/resource/authority/country/MLT + + + 
http://publications.europa.eu/resource/authority/country/MNE + + + http://publications.europa.eu/resource/authority/country/MRT + + + http://publications.europa.eu/resource/authority/country/NER + + + http://publications.europa.eu/resource/authority/country/NGA + + + http://publications.europa.eu/resource/authority/country/NLD + + + http://publications.europa.eu/resource/authority/country/NOR + + + http://publications.europa.eu/resource/authority/country/POL + + + http://publications.europa.eu/resource/authority/country/ALB + + + http://publications.europa.eu/resource/authority/country/BEN + + + http://publications.europa.eu/resource/authority/country/VAT + + + http://publications.europa.eu/resource/authority/country/UKR + + + http://publications.europa.eu/resource/authority/country/CAF + + + http://publications.europa.eu/resource/authority/country/BLR + + + http://publications.europa.eu/resource/authority/country/CIV + + + http://publications.europa.eu/resource/authority/country/CHE + + + http://publications.europa.eu/resource/authority/country/CPV + + + http://publications.europa.eu/resource/authority/country/CMR + + + http://publications.europa.eu/resource/authority/country/CZE + + + http://publications.europa.eu/resource/authority/country/CYP + + + http://publications.europa.eu/resource/authority/country/DJI + + + http://publications.europa.eu/resource/authority/country/DEU + + + http://publications.europa.eu/resource/authority/country/ITA + + + http://publications.europa.eu/resource/authority/country/JEY + + + http://publications.europa.eu/resource/authority/country/ISL + + + http://publications.europa.eu/resource/authority/country/ISR + + + http://publications.europa.eu/resource/authority/country/IRL + + + http://publications.europa.eu/resource/authority/country/IRQ + + + http://publications.europa.eu/resource/authority/country/HUN + + + http://publications.europa.eu/resource/authority/country/IMN + + + 
http://publications.europa.eu/resource/authority/country/GRL + + + http://publications.europa.eu/resource/authority/country/HRV + + + http://publications.europa.eu/resource/authority/country/ALA + + + http://publications.europa.eu/resource/authority/country/BEL + + + http://publications.europa.eu/resource/authority/country/BIH + + + http://publications.europa.eu/resource/authority/country/SLE + + + http://publications.europa.eu/resource/authority/country/SEN + + + http://publications.europa.eu/resource/authority/country/SDN + + + http://publications.europa.eu/resource/authority/country/SAU + + + http://publications.europa.eu/resource/authority/country/RUS + + + http://publications.europa.eu/resource/authority/country/ROU + + + http://publications.europa.eu/resource/authority/country/PSE + + + http://publications.europa.eu/resource/authority/country/PRT + + + http://publications.europa.eu/resource/authority/country/SOM + + + http://publications.europa.eu/resource/authority/country/SMR + + + + http://data.europa.eu/88u/dataset/60a0a5b0-b63c-473f-b8bc-207ea037eb3b + EU + OPEN + 0021 + 2019-07-09 + + + CC_BY_4_0 + ENG + + + + + https%3A%2F%2Fdata.europa.eu%2Feuodp%2Fdata%2Fapiodp%2Faction%2Fpackage_search + + + + + + + false + false + 0.9 + + + + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_crossref.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_crossref.xml index 8f69a5e2d..93349f3c9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_crossref.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_claim_crossref.xml @@ -46,6 +46,7 @@ 10.1080/23744235.2020.1774644 Infectious Diseases + corda__h2020::814530 diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid-noauthor.xml 
b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid-noauthor.xml new file mode 100644 index 000000000..53256bed0 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid-noauthor.xml @@ -0,0 +1,117 @@ + + + + r3f52792889d::00002412cb25f2f3047712d00ab2c8eb + hdl:11858/00-1734-0000-0003-EE73-2 + 2020-12-16T10:04:03.148Z + r3f52792889d + textgrid:rn8z.0 + 2012-01-29T20:54:12Z + 2020-12-16T16:02:37.562Z + + + + hdl:11858/00-1734-0000-0003-EE73-2 + + + + + + + + Auf dem Trocknen + Detlev von Liliencron: Gute Nacht. Hinterlassene Gedichte, Berlin: Schuster & Loeffler, 1909. + + TextGrid + 2012 + + + tvitt@textgrid.de + + + Digitale Bibliothek + TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c + + + + 2012-01-29T20:54:12Z + 2012-01-29T20:54:12Z + 2012-01-29T20:54:12Z + + + + textgrid:rn8z.0 + http://hdl.handle.net/hdl:11858/00-1734-0000-0003-EE73-2 + + + hdl:11858/00-1734-0000-0003-EE72-4 + + + 527 Bytes + + + text/tg.edition+tg.aggregation+xml + + 0 + + Der annotierte Datenbestand der Digitalen Bibliothek inklusive + Metadaten sowie davon einzeln zugängliche Teile sind eine Abwandlung + des Datenbestandes von www.editura.de durch TextGrid und werden + unter der Lizenz Creative Commons Namensnennung 3.0 Deutschland + Lizenz (by-Nennung TextGrid) veröffentlicht. Die Lizenz bezieht sich + nicht auf die der Annotation zu Grunde liegenden allgemeinfreien + Texte (Siehe auch Punkt 2 der Lizenzbestimmungen). 
+ + + + + + + + Berlin + + + + hdl:11858/00-1734-0000-0003-EE73-2 + 0021 + 0002 + 2012-01-29 + OPEN + http://creativecommons.org/licenses/by/3.0/de/legalcode + und + DE + + + + + + + https%3A%2F%2Fdev.textgridlab.org%2F1.0%2Ftgoaipmh%2Foai + textgrid:rn8z.0 + 2012-01-29T20:54:12Z + + + + + false + false + 0.9 + + + + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid.xml index eddbc1ec4..7ad7c4d6b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid.xml @@ -6,29 +6,29 @@ xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> - r3f52792889d::000051aa1f61d77d2c0b340091f8024e - textgrid:q9cv.0 - 2020-11-17T09:34:11.128+01:00 + r3f52792889d::00002412cb25f2f3047712d00ab2c8eb + hdl:11858/00-1734-0000-0003-EE73-2 + 2020-12-16T10:04:03.148Z r3f52792889d - textgrid:q9cv.0 - 2012-01-21T13:35:20Z - 2020-11-17T19:08:56.703+01:00 + textgrid:rn8z.0 + 2012-01-29T20:54:12Z + 2020-12-16T16:02:37.562Z - hdl:11858/00-1734-0000-0003-7664-F + hdl:11858/00-1734-0000-0003-EE73-2 - Hoffmann von Fallersleben, August Heinrich - 118552589 + Liliencron, Detlev von + 118572954 - Mailied - August Heinrich Hoffmann von Fallersleben: Unpolitische Lieder von Hoffmann von Fallersleben, 1. + 2. Theil, 1. Theil, Hamburg: Hoffmann und Campe, 1841. + Auf dem Trocknen + Detlev von Liliencron: Gute Nacht. Hinterlassene Gedichte, Berlin: Schuster & Loeffler, 1909. 
TextGrid 2012 @@ -38,21 +38,21 @@ Digitale Bibliothek - TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c + TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c - 2012-01-21T13:35:20Z - 2012-01-21T13:35:20Z - 2012-01-21T13:35:20Z + 2012-01-29T20:54:12Z + 2012-01-29T20:54:12Z + 2012-01-29T20:54:12Z - textgrid:q9cv.0 - http://hdl.handle.net/hdl:11858/00-1734-0000-0003-7664-F + textgrid:rn8z.0 + http://hdl.handle.net/hdl:11858/00-1734-0000-0003-EE73-2 - hdl:11858/00-1734-0000-0003-7666-B + hdl:11858/00-1734-0000-0003-EE72-4 527 Bytes @@ -77,17 +77,18 @@ Hamburg + xmlns:xs="http://www.w3.org/2001/XMLSchema" xsi:type="xs:string">Berlin - hdl:11858/00-1734-0000-0003-7664-F + hdl:11858/00-1734-0000-0003-EE73-2 0021 0002 - 2012-01-21 + 2012-01-29 OPEN http://creativecommons.org/licenses/by/3.0/de/legalcode und + DE @@ -95,11 +96,11 @@ xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> - + https%3A%2F%2Fdev.textgridlab.org%2F1.0%2Ftgoaipmh%2Foai - textgrid:q9cv.0 - 2012-01-21T13:35:20Z - http://schema.datacite.org/oai/oai-1.0/ + textgrid:rn8z.0 + 2012-01-29T20:54:12Z + @@ -107,9 +108,10 @@ false 0.9 - - \ No newline at end of file + + diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala index 705160a2b..8043236e0 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala @@ -1,15 +1,15 @@ package eu.dnetlib.dhp.export +import com.fasterxml.jackson.databind.ObjectMapper + import java.time.LocalDateTime import java.time.format.DateTimeFormatter - import eu.dnetlib.dhp.common.PacePerson import eu.dnetlib.dhp.schema.action.AtomicAction -import 
eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty} +import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication} import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils -import org.codehaus.jackson.map.ObjectMapper import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._ import scala.collection.JavaConverters._ diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala index f1e374f95..3f632af22 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/SparkExportContentForOpenAire.scala @@ -1,27 +1,21 @@ package eu.dnetlib.dhp.`export` import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Instance, Publication, Relation, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication} import org.apache.commons.io.IOUtils import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.mapred.SequenceFileOutputFormat -import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.functions._ import org.apache.spark.sql.expressions.Window -import org.apache.spark.{SparkConf, SparkContext} -import org.codehaus.jackson.map.ObjectMapper +import 
org.apache.spark.SparkConf import scala.collection.mutable.ArrayBuffer -import scala.collection.JavaConverters._ object SparkExportContentForOpenAire { - - - def main(args: Array[String]): Unit = { val conf: SparkConf = new SparkConf() val parser = new ArgumentApplicationParser(IOUtils.toString(SparkExportContentForOpenAire.getClass.getResourceAsStream("input_export_content_parameters.json"))) @@ -178,11 +172,4 @@ object SparkExportContentForOpenAire { fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/export/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) } - - - - - - - } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DropAndCreateESIndex.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DropAndCreateESIndex.java index 1b5849f35..7598fd957 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DropAndCreateESIndex.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DropAndCreateESIndex.java @@ -10,7 +10,8 @@ import org.apache.http.client.methods.HttpPut; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; -import org.codehaus.jackson.map.ObjectMapper; + +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkConvertDatasetToJson.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkConvertDatasetToJson.scala index 81bdb2941..8133666a6 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkConvertDatasetToJson.scala 
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkConvertDatasetToJson.scala @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.provision +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.provision.scholix.Scholix import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary @@ -7,7 +8,6 @@ import org.apache.commons.io.IOUtils import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SparkSession} -import org.codehaus.jackson.map.ObjectMapper object SparkConvertDatasetToJson { diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 1547056b9..0d44d8e5e 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -54,6 +54,13 @@ spark-solr + + + junit + junit + 4.12 + test + org.apache.solr solr-test-framework @@ -140,6 +147,12 @@ org.apache.zookeeper zookeeper + + + junit + junit + + diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 466c6a9e4..612e7db06 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -27,6 +27,7 @@ import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; @@ 
-272,11 +273,7 @@ public class CreateRelatedEntitiesJob_phase2 { .filter(Objects::nonNull) .map(Qualifier::getClassid) .filter(StringUtils::isNotBlank) - .anyMatch(c -> "orcid".equals(c.toLowerCase())); - } - - private static FilterFunction filterEmptyEntityFn() { - return (FilterFunction) v -> Objects.nonNull(v.getEntity()); + .anyMatch(c -> c.toLowerCase().contains(ModelConstants.ORCID)); } private static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/AuthorPidTypeComparator.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/AuthorPidTypeComparator.java new file mode 100644 index 000000000..7391569ed --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/AuthorPidTypeComparator.java @@ -0,0 +1,52 @@ + +package eu.dnetlib.dhp.oa.provision.utils; + +import java.util.Comparator; +import java.util.Optional; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class AuthorPidTypeComparator implements Comparator { + + @Override + public int compare(StructuredProperty left, StructuredProperty right) { + + String lClass = Optional + .ofNullable(left) + .map(StructuredProperty::getQualifier) + .map(Qualifier::getClassid) + .orElse(null); + + String rClass = Optional + .ofNullable(right) + .map(StructuredProperty::getQualifier) + .map(Qualifier::getClassid) + .orElse(null); + + if (lClass == null && rClass == null) + return 0; + if (lClass == null) + return 1; + if (rClass == null) + return -1; + + if (lClass.equals(rClass)) + return 0; + + if (lClass.equals(ModelConstants.ORCID)) + return -1; + if (rClass.equals(ModelConstants.ORCID)) + return 1; + + if (lClass.equals(ModelConstants.ORCID_PENDING)) + return -1; + if 
(rClass.equals(ModelConstants.ORCID_PENDING)) + return 1; + + return 0; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java index 0e742365a..d2131ef28 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java @@ -7,13 +7,16 @@ import java.util.Set; import com.google.common.collect.Sets; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; public class GraphMappingUtils { public static final String SEPARATOR = "_"; - public static Set authorPidTypes = Sets.newHashSet("orcid", "magidentifier"); + public static Set authorPidTypes = Sets + .newHashSet( + ModelConstants.ORCID, ModelConstants.ORCID_PENDING, "magidentifier"); public static String removePrefix(final String s) { if (s.contains("|")) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java index 21b526ab1..173ba326a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java @@ -73,7 +73,9 @@ public class TemplateFactory { final Collection fields, final String semanticclass, final String semantischeme, - final DataInfo info) { + final DataInfo info, + final boolean validated, + final String validationDate) { return getTemplate(resources.getRel()) .add("type", type) .add("objIdentifier", escapeXml(removePrefix(objIdentifier))) @@ -86,6 +88,8 @@ public class TemplateFactory { .add( "provenanceaction", 
info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "") + .add("validated", validated) + .add("validationdate", validationDate) .render(); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index eba736228..af6081c5d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -254,6 +254,18 @@ public class XmlRecordFactory implements Serializable { p -> p, (p1, p2) -> p1)) .values() + .stream() + .collect( + Collectors + .groupingBy( + p -> p.getValue(), + Collectors + .mapping( + p -> p, + Collectors.minBy(new AuthorPidTypeComparator())))) + .values() + .stream() + .map(op -> op.get()) .forEach( sp -> { String pidType = getAuthorPidType(sp.getQualifier().getClassid()); @@ -1082,9 +1094,12 @@ public class XmlRecordFactory implements Serializable { String.format("missing scheme for: <%s - %s>", type.toString(), targetType)); } final HashSet fields = Sets.newHashSet(mapFields(link, contexts)); + if (rel.getValidated() == null) + rel.setValidated(false); return templateFactory .getRel( - targetType, rel.getTarget(), fields, rel.getRelClass(), scheme, rel.getDataInfo()); + targetType, rel.getTarget(), fields, rel.getRelClass(), scheme, rel.getDataInfo(), rel.getValidated(), + rel.getValidationDate()); } private List listChildren( diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/template/rel.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/template/rel.st index af19ba497..e77a86e1d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/template/rel.st +++ 
b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/template/rel.st @@ -1,4 +1,5 @@ - $objIdentifier$ +$if(validated)$$else$$endif$ +$objIdentifier$ $metadata:{ it | $it$ }$ \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java index 33def91b3..f57b8dcaf 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java @@ -6,15 +6,13 @@ import org.apache.solr.client.solrj.response.UpdateResponse; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; -import junit.framework.Assert; - public class SolrAdminApplicationTest extends SolrTest { @Test public void testPing() throws Exception { SolrPingResponse pingResponse = miniCluster.getSolrClient().ping(); log.info("pingResponse: '{}'", pingResponse.getStatus()); - Assert.assertTrue(pingResponse.getStatus() == 0); + Assertions.assertTrue(pingResponse.getStatus() == 0); } @Test diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java index 992ab26e8..27860ca32 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java @@ -5,42 +5,47 @@ import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.io.StringReader; +import java.util.List; import org.apache.commons.io.IOUtils; import org.dom4j.Document; import org.dom4j.DocumentException; import 
org.dom4j.io.SAXReader; import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; +import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; +import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; -//TODO to enable it we need to update the joined_entity.json test file -@Disabled public class XmlRecordFactoryTest { private static final String otherDsTypeId = "scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource"; + private static ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + @Test public void testXMLRecordFactory() throws IOException, DocumentException { - String json = IOUtils.toString(getClass().getResourceAsStream("joined_entity.json")); - - assertNotNull(json); - JoinedEntity je = new ObjectMapper().readValue(json, JoinedEntity.class); - assertNotNull(je); - ContextMapper contextMapper = new ContextMapper(); XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation, otherDsTypeId); - String xml = xmlRecordFactory.build(je); + Publication p = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); + + String xml = xmlRecordFactory.build(new JoinedEntity<>(p)); assertNotNull(xml); @@ -48,6 +53,77 @@ public class XmlRecordFactoryTest { assertNotNull(doc); + System.out.println(doc.asXML()); + + 
Assertions.assertEquals("0000-0001-9613-6638", doc.valueOf("//creator[@rank = '1']/@orcid")); + Assertions.assertEquals("0000-0001-9613-6639", doc.valueOf("//creator[@rank = '1']/@orcid_pending")); + + Assertions.assertEquals("0000-0001-9613-9956", doc.valueOf("//creator[@rank = '2']/@orcid")); + Assertions.assertEquals("", doc.valueOf("//creator[@rank = '2']/@orcid_pending")); + // TODO add assertions based of values extracted from the XML record } + + @Test + public void testXMLRecordFactoryWithValidatedProject() throws IOException, DocumentException { + + ContextMapper contextMapper = new ContextMapper(); + + XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation, + otherDsTypeId); + + Publication p = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); + Project pj = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResourceAsStream("project.json")), Project.class); + Relation rel = OBJECT_MAPPER + .readValue( + (IOUtils.toString(getClass().getResourceAsStream("relToValidatedProject.json"))), Relation.class); + RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class); + List links = Lists.newArrayList(); + RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject); + links.add(rew); + JoinedEntity je = new JoinedEntity<>(p); + je.setLinks(links); + + String xml = xmlRecordFactory.build(je); + + assertNotNull(xml); + + Document doc = new SAXReader().read(new StringReader(xml)); + assertNotNull(doc); + System.out.println(doc.asXML()); + Assertions.assertEquals("2021-01-01", doc.valueOf("//validated/@date")); + } + + @Test + public void testXMLRecordFactoryWithNonValidatedProject() throws IOException, DocumentException { + + ContextMapper contextMapper = new ContextMapper(); + + XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, 
XmlConverterJob.schemaLocation, + otherDsTypeId); + + Publication p = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); + Project pj = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResourceAsStream("project.json")), Project.class); + Relation rel = OBJECT_MAPPER + .readValue((IOUtils.toString(getClass().getResourceAsStream("relToProject.json"))), Relation.class); + RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class); + List links = Lists.newArrayList(); + RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject); + links.add(rew); + JoinedEntity je = new JoinedEntity<>(p); + je.setLinks(links); + + String xml = xmlRecordFactory.build(je); + + assertNotNull(xml); + + Document doc = new SAXReader().read(new StringReader(xml)); + assertNotNull(doc); + System.out.println(doc.asXML()); + assertEquals("", doc.valueOf("//rel/validated")); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml index 1f5cf7b81..f92b63dfd 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml @@ -61,7 +61,7 @@ - + @@ -72,6 +72,7 @@ + @@ -79,6 +80,7 @@ + diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/project.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/project.json new file mode 100644 index 000000000..b61e55d1a --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/project.json @@ -0,0 +1,109 @@ +{ + "id": "40|corda__h2020::79a0e16c122c9a18eb60e4a5e64b620d", + "originalId": [], + "pid": [], + "dateofcollection": "2020-01-01", + 
"dateoftransformation": "2020-01-01", + "extraInfo": [], + "oaiprovenance": null, + "websiteurl": { + "value": "https://web.site", + "datainfo": null + }, + "code": { + "value": "79a0e", + "datainfo": null + }, + "acronym": { + "value": "79a0e_acronym", + "datainfo": null + }, + "title": { + "value": "79a0e_title", + "datainfo": null + }, + "startdate": { + "value": "2019-02-01", + "datainfo": null + }, + "enddate": { + "value": "2021-01-09", + "datainfo": null + }, + "callidentifier": { + "value": "79a0e_callID", + "datainfo": null + }, + "keywords": { + "value": "", + "datainfo": null + }, + "duration": { + "value": "", + "datainfo": null + }, + "ecsc39": { + "value": "true", + "datainfo": null + }, + "oamandatepublications": { + "value": "true", + "datainfo": null + }, + "ecarticle29_3": { + "value": "false", + "datainfo": null + }, + "optional1": { + "value": "", + "datainfo": null + }, + "optional2": { + "value": "", + "datainfo": null + }, + "jsonextrainfo":{ + "value": "", + "datainfo": null + }, + "contactfullname":{ + "value": "", + "datainfo": null + }, + "contactfax": { + "value": "", + "datainfo": null + }, + "contactphone": { + "value": "", + "datainfo": null + }, + "contactemail": { + "value": "", + "datainfo": null + }, + "summary": { + "value": "79a0e_description", + "datainfo": null + }, + "currency": { + "value": "EUR", + "datainfo": null + }, + "totalcost": 120000, + "fundedamount": 18000, + "h2020topiccode": "", + "h2020topicdescription": "", + "h2020classification": [], + "subjects": [ + { + "value": "", + "qualifier": null, + "datainfo": null + } + ], + "fundingtree": [] + +} + + diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json new file mode 100644 index 000000000..ea7a30051 --- /dev/null +++ 
b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json @@ -0,0 +1,820 @@ +{ + "author": [ + { + "affiliation": [], + "fullname": "Lee, Jaehyun", + "name": "Jaehyun", + "pid": [ + { + "qualifier": { + "classid": "orcid", + "classname": "Open Researcher and Contributor ID", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "0000-0001-9613-6638" + }, + { + "qualifier": { + "classid": "orcid_pending", + "classname": "Open Researcher and Contributor ID", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "0000-0001-9613-6639" + } + ], + "rank": 1, + "surname": "Lee" + }, + { + "affiliation": [], + "fullname": "Berrada, Salim", + "name": "Salim", + "pid": [ + { + "qualifier": { + "classid": "orcid", + "classname": "Open Researcher and Contributor ID", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "0000-0001-9613-9956" + }, + { + "qualifier": { + "classid": "orcid_pending", + "classname": "Open Researcher and Contributor ID", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "0000-0001-9613-9956" + } + ], + "rank": 2, + "surname": "Berrada" + }, + { + "affiliation": [], + "fullname": "Adamu-Lema, Fikru", + "name": "Fikru", + "pid": [], + "rank": 3, + "surname": "Adamu-Lema" + }, + { + "affiliation": [], + "fullname": "Nagy, Nicole", + "name": "Nicole", + "pid": [], + "rank": 4, + "surname": "Nagy" + }, + { + "affiliation": [], + "fullname": "Georgiev, Vihar P.", + "name": "Vihar P.", + "pid": [], + "rank": 5, + "surname": "Georgiev" + }, + { + "affiliation": [], + "fullname": "Sadi, Toufik", + "name": "Toufik", + "pid": [], + "rank": 6, + "surname": "Sadi" + }, + { + "affiliation": [], + "fullname": "Liang, Jie", + "name": "Jie", + "pid": [], + "rank": 7, + "surname": "Liang" + }, + { + "affiliation": [], + "fullname": "Ramos, Raphael", + "name": "Raphael", + "pid": [], + "rank": 8, + "surname": "Ramos" 
+ }, + { + "affiliation": [], + "fullname": "Carrillo-Nunez, Hamilton", + "name": "Hamilton", + "pid": [], + "rank": 9, + "surname": "Carrillo-Nunez" + }, + { + "affiliation": [], + "fullname": "Kalita, Dipankar", + "name": "Dipankar", + "pid": [], + "rank": 10, + "surname": "Kalita" + }, + { + "affiliation": [], + "fullname": "Lilienthal, Katharina", + "name": "Katharina", + "pid": [], + "rank": 11, + "surname": "Lilienthal" + }, + { + "affiliation": [], + "fullname": "Wislicenus, Marcus", + "name": "Marcus", + "pid": [], + "rank": 12, + "surname": "Wislicenus" + }, + { + "affiliation": [], + "fullname": "Pandey, Reeturaj", + "name": "Reeturaj", + "pid": [], + "rank": 13, + "surname": "Pandey" + }, + { + "affiliation": [], + "fullname": "Chen, Bingan", + "name": "Bingan", + "pid": [], + "rank": 14, + "surname": "Chen" + }, + { + "affiliation": [], + "fullname": "Teo, Kenneth B.K.", + "name": "Kenneth B. K.", + "pid": [], + "rank": 15, + "surname": "Teo" + }, + { + "affiliation": [], + "fullname": "Goncalves, Goncalo", + "name": "Goncalo", + "pid": [], + "rank": 16, + "surname": "Goncalves" + }, + { + "affiliation": [], + "fullname": "Okuno, Hanako", + "name": "Hanako", + "pid": [], + "rank": 17, + "surname": "Okuno" + }, + { + "affiliation": [], + "fullname": "Uhlig, Benjamin", + "name": "Benjamin", + "pid": [], + "rank": 18, + "surname": "Uhlig" + }, + { + "affiliation": [], + "fullname": "Todri-Sanial, Aida", + "name": "Aida", + "pid": [], + "rank": 19, + "surname": "Todri-Sanial" + }, + { + "affiliation": [], + "fullname": "Dijon", + "name": "", + "pid": [], + "rank": 20, + "surname": "" + }, + { + "affiliation": [], + "fullname": "Jean", + "name": "", + "pid": [], + "rank": 21, + "surname": "" + } + ], + "collectedfrom": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + 
}, + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + } + ], + "context": [], + "contributor": [], + "country": [], + "coverage": [], + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "value": "2018-01-01" + }, + "dateofcollection": "2020-01-27T11:32:33.729Z", + "dateoftransformation": "2020-01-27T12:03:59.662Z", + "description": [], + "embargoenddate": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "value": "" + }, + "extraInfo": [], + "format": [], + "fulltext": [], + "id": "50|CSC_________::0000ec4dd9df012feaafa77e71a0fb4c", + "instance": [ + { + "accessright": { + "classid": "OPEN", + "classname": "Open Access", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "collectedfrom": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + 
"provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "value": "2018-01-01" + }, + "distributionlocation": "", + "hostedby": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "instancetype": { + "classid": "0001", + "classname": "Article", + "schemeid": "dnet:dataCite_resource", + "schemename": "dnet:dataCite_resource" + }, + "license": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "value": "" + }, + "url": [ + "http://juuli.fi/Record/0331473718", + "http://dx.doi.org/10.1109/TED.2018.2853550" + ] + } + ], + "journal": { + "conferencedate": "", + "conferenceplace": "", + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "edition": "", + "ep": "3892", + "iss": "9", + "issnLinking": "", + "issnOnline": "", + "issnPrinted": "0018-9383", + "name": "IEEE Transactions on Electron Devices", + "sp": "3884", + "vol": "65" + }, + "language": { + "classid": "en", + "classname": "en", + "schemeid": "dnet:languages", + "schemename": "dnet:languages" + }, + "lastupdatetimestamp": 0, + "originalId": [ + "0331473718", + "10.1109/TED.2018.2853550", + "http://juuli.fi/Record/0331473718" + ], + "pid": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": 
"", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1109/TED.2018.2853550" + } + ], + "publisher": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "value": "" + }, + "relevantdate": [], + "resourcetype": { + "classid": "0001", + "classname": "Article", + "schemeid": "dnet:dataCite_resource", + "schemename": "dnet:dataCite_resource" + }, + "resulttype": { + "classid": "publication", + "classname": "publication", + "schemeid": "dnet:result_typologies", + "schemename": "dnet:result_typologies" + }, + "source": [], + "subject": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "http://finto.fi/okm-tieteenala/en/", + "classname": "finto", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "ta114" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "Conductivity" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + 
"classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "Contacts" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "Cu-carbon nanotubes (CNT) composites" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "density functional theory (DFT)" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "Discrete Fourier transforms" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + 
"qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "Electromigration" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "electromigration (EM)" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "electrothermal" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "interconnects" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": 
"dnet:subject_classification_typologies" + }, + "value": "Lattices" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "multiscale simulation" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "Resistance" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "self-heating." 
+ }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "Thermal conductivity" + } + ], + "title": [ + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "Understanding Electromigration in Cu-CNT Composite Interconnects A Multiscale Electrothermal Simulation Study" + } + ] +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/record.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/record.xml index b617dbea2..2d6049416 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/record.xml +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/record.xml @@ -39,6 +39,8 @@ Saykally, Jessica N. Keeley, Kristen L. Haris Hatic + Baglioni, Miriam + De Bonis, Michele 2017-06-01 Withania somnifera has been used in traditional medicine for a variety of neural disorders. 
Recently, chronic neurodegenerative conditions have been @@ -115,7 +117,7 @@ Cell Transplantation - + Cell Transplantation @@ -292,6 +294,28 @@ >rcuk________::23feba2a5ca7f6b6016bf3a45180da50 University of Delhi + + true + corda_______::30c6b5ab90f30666de1d112fb93d8c77 + 227878 + + + ec__________::EC::FP7 + ec__________::EC::FP7::SP2 + ec__________::EC::FP7::SP2::ERC + + Complex structure and dynamics of collective motion + COLLMOT + + + + irb_hr______::2330a1d0dac71ffbe15fbcbc807288d4 + 108-1083570-3635 + + + + Pentadecapeptide BPC 157 - further investigations + diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relToProject.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relToProject.json new file mode 100644 index 000000000..9e2824e52 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relToProject.json @@ -0,0 +1,31 @@ +{ +"collectedfrom": [ +{ +"key": "10|opendoar____::eccbc87e4b5ce2fe28308fd9f2a7baf3", +"value": "AMS Acta", +"dataInfo": null +} +], +"dataInfo": { +"invisible": false, +"inferred": false, +"deletedbyinference": false, +"trust": "0.9", +"inferenceprovenance": "", +"provenanceaction": { +"classid": "sysimport:crosswalk:repository", +"classname": "sysimport:crosswalk:repository", +"schemeid": "dnet:provenanceActions", +"schemename": "dnet:provenanceActions" +} +}, +"lastupdatetimestamp": 1606898557407, +"relType": "resultProject", +"subRelType": "outcome", +"relClass": "isProducedBy", +"source": "50|CSC_________::0000ec4dd9df012feaafa77e71a0fb4c", +"target": "40|corda__h2020::79a0e16c122c9a18eb60e4a5e64b620d", +"validated": null, +"validationDate": null, +"properties": [] +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relToValidatedProject.json 
b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relToValidatedProject.json new file mode 100644 index 000000000..0346d7264 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/relToValidatedProject.json @@ -0,0 +1,31 @@ +{ +"collectedfrom": [ +{ +"key": "10|opendoar____::eccbc87e4b5ce2fe28308fd9f2a7baf3", +"value": "AMS Acta", +"dataInfo": null +} +], +"dataInfo": { +"invisible": false, +"inferred": false, +"deletedbyinference": false, +"trust": "0.9", +"inferenceprovenance": "", +"provenanceaction": { +"classid": "sysimport:crosswalk:repository", +"classname": "sysimport:crosswalk:repository", +"schemeid": "dnet:provenanceActions", +"schemename": "dnet:provenanceActions" +} +}, +"lastupdatetimestamp": 1606898557407, +"relType": "resultProject", +"subRelType": "outcome", +"relClass": "isProducedBy", +"source": "50|CSC_________::0000ec4dd9df012feaafa77e71a0fb4c", +"target": "40|corda__h2020::79a0e16c122c9a18eb60e4a5e64b620d", +"validated": true, +"validationDate": "2021-01-01", +"properties": [] +} \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh new file mode 100644 index 000000000..57acb2ee7 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/finalizedb.sh @@ -0,0 +1,18 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export SHADOW=$2 + +echo "Updating shadow database" +impala-shell -d ${SOURCE} -q "invalidate metadata" +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f - +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh deleted file mode 100644 index 70112dc7b..000000000 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh +++ /dev/null @@ -1,18 +0,0 @@ -export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs -export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) -if ! 
[ -L $link_folder ] -then - rm -Rf "$link_folder" - ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} -fi - -echo "Getting file from " $3 -hdfs dfs -copyToLocal $3 - -echo "Running impala shell make the new database visible" -impala-shell -q "INVALIDATE METADATA;" - -echo "Running impala shell to compute new table stats" -impala-shell -d $1 -f $2 -echo "Impala shell finished" -rm $2 diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh new file mode 100644 index 000000000..c5bda6d39 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -0,0 +1,25 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 + +echo "Getting file from " $4 +hdfs dfs -copyToLocal $4 + +echo "Creating monitor database" +cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - +echo "Impala shell finished" + +echo "Updating shadow monitor database" +impala-shell -q "create database if not exists ${SHADOW}" +impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - +impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - +echo "Shadow db ready!" 
\ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index 46ff295f4..6c96317e6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -3,14 +3,37 @@ -- Tables/views from external tables/views (Fundref, Country, CountyGDP, roarmap, rndexpediture) ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS SELECT * FROM ${external_stats_db_name}.fundref; -CREATE OR REPLACE VIEW ${stats_db_name}.country AS SELECT * FROM ${external_stats_db_name}.country; -CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS SELECT * FROM ${external_stats_db_name}.countrygdp; -CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS SELECT * FROM ${external_stats_db_name}.roarmap; -CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture; -CREATE OR REPLACE VIEW ${stats_db_name}.context AS SELECT * FROM ${external_stats_db_name}.context; -CREATE OR REPLACE VIEW ${stats_db_name}.category AS SELECT * FROM ${external_stats_db_name}.category; -CREATE OR REPLACE VIEW ${stats_db_name}.concept AS SELECT * FROM ${external_stats_db_name}.concept; +CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS +SELECT * +FROM ${external_stats_db_name}.fundref; + +CREATE OR REPLACE VIEW ${stats_db_name}.country AS +SELECT * +FROM ${external_stats_db_name}.country; + +CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS +SELECT * +FROM ${external_stats_db_name}.countrygdp; + +CREATE OR 
REPLACE VIEW ${stats_db_name}.roarmap AS +SELECT * +FROM ${external_stats_db_name}.roarmap; + +CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS +SELECT * +FROM ${external_stats_db_name}.rndexpediture; + +CREATE OR REPLACE VIEW ${stats_db_name}.context AS +SELECT * +FROM ${external_stats_db_name}.context; + +CREATE OR REPLACE VIEW ${stats_db_name}.category AS +SELECT * +FROM ${external_stats_db_name}.category; + +CREATE OR REPLACE VIEW ${stats_db_name}.concept AS +SELECT * +FROM ${external_stats_db_name}.concept; ------------------------------------------------------------------------------------------------ @@ -18,4 +41,8 @@ CREATE OR REPLACE VIEW ${stats_db_name}.concept AS SELECT * FROM ${external_sta -- Creation date of the database ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -create table ${stats_db_name}.creation_date as select date_format(current_date(), 'dd-MM-yyyy') as date; \ No newline at end of file +create table ${stats_db_name}.creation_date as +select date_format(current_date(), 'dd-MM-yyyy') as date; +-- +-- ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.creation_date COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index 13e141459..d26169fd6 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -5,40 +5,114 @@ ---------------------------------------------------------------- --Datasource temporary table updates -UPDATE 
${stats_db_name}.datasource_tmp SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd WHERE d.id=rd.datasource); +UPDATE ${stats_db_name}.datasource_tmp +SET harvested='true' +WHERE datasource_tmp.id IN (SELECT DISTINCT d.id + FROM ${stats_db_name}.datasource_tmp d, + ${stats_db_name}.result_datasources rd + WHERE d.id = rd.datasource); -- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables -UPDATE ${stats_db_name}.project_tmp SET haspubs='yes' WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result=r.id AND r.type='publication'); +UPDATE ${stats_db_name}.project_tmp +SET haspubs='yes' +WHERE project_tmp.id IN (SELECT pr.id + FROM ${stats_db_name}.project_results pr, + ${stats_db_name}.result r + WHERE pr.result = r.id + AND r.type = 'publication'); -DROP TABLE IF EXISTS ${stats_db_name}.project; CREATE TABLE ${stats_db_name}.project stored as parquet as -SELECT p.id , p.acronym, p.title, p.funder, p.funding_lvl0, p.funding_lvl1, p.funding_lvl2, p.ec39, p.type, p.startdate, p.enddate, p.start_year, p.end_year, p.duration, -CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs, -CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs, -CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub, -CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs, -p.callidentifier, p.code -FROM ${stats_db_name}.project_tmp p -LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np - FROM ${stats_db_name}.project_results pr INNER JOIN ${stats_db_name}.result r ON pr.result=r.id - WHERE r.type='publication' - GROUP BY pr.id) AS prr1 on prr1.id = p.id -LEFT JOIN (SELECT pp.id, max(datediff(to_date(r.date), to_date(pp.enddate)) ) AS daysForlastPub , count(distinct r.id) AS dp - FROM ${stats_db_name}.project_tmp 
pp, ${stats_db_name}.project_results pr, ${stats_db_name}.result r - WHERE pp.id=pr.id AND pr.result=r.id AND r.type='publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 - GROUP BY pp.id) AS prr2 - ON prr2.id = p.id; - --- Publication temporary table updates -UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' WHERE publication_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0); +SELECT p.id, + p.acronym, + p.title, + p.funder, + p.funding_lvl0, + p.funding_lvl1, + p.funding_lvl2, + p.ec39, + p.type, + p.startdate, + p.enddate, + p.start_year, + p.end_year, + p.duration, + CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs, + CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs, + CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub, + CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs, + p.callidentifier, + p.code +FROM ${stats_db_name}.project_tmp p + LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np + FROM ${stats_db_name}.project_results pr + INNER JOIN ${stats_db_name}.result r ON pr.result = r.id + WHERE r.type = 'publication' + GROUP BY pr.id) AS prr1 on prr1.id = p.id + LEFT JOIN (SELECT pp.id, + max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub, + count(distinct r.id) AS dp + FROM ${stats_db_name}.project_tmp pp, + ${stats_db_name}.project_results pr, + ${stats_db_name}.result r + WHERE pp.id = pr.id + AND pr.result = r.id + AND r.type = 'publication' + AND datediff(to_date(r.date), to_date(pp.enddate)) > 0 + GROUP BY pp.id) AS prr2 + ON prr2.id = p.id; --- Dataset temporary table updates -UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' WHERE dataset_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p 
WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0); +UPDATE ${stats_db_name}.publication_tmp +SET delayed = 'yes' +WHERE publication_tmp.id IN (SELECT distinct r.id + FROM stats_wf_db_obs.result r, + ${stats_db_name}.project_results pr, + ${stats_db_name}.project_tmp p + WHERE r.id = pr.result + AND pr.id = p.id + AND to_date(r.date) - to_date(p.enddate) > 0); --- Software temporary table updates -UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' WHERE software_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0); +UPDATE ${stats_db_name}.dataset_tmp +SET delayed = 'yes' +WHERE dataset_tmp.id IN (SELECT distinct r.id + FROM stats_wf_db_obs.result r, + ${stats_db_name}.project_results pr, + ${stats_db_name}.project_tmp p + WHERE r.id = pr.result + AND pr.id = p.id + AND to_date(r.date) - to_date(p.enddate) > 0); --- Oherresearchproduct temporary table updates -UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0); +UPDATE ${stats_db_name}.software_tmp +SET delayed = 'yes' +WHERE software_tmp.id IN (SELECT distinct r.id + FROM ${stats_db_name}.result r, + ${stats_db_name}.project_results pr, + ${stats_db_name}.project_tmp p + WHERE r.id = pr.result + AND pr.id = p.id + AND to_date(r.date) - to_date(p.enddate) > 0); -CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, project.enddate as projectenddate, result_projects.daysfromend AS daysfromend FROM ${stats_db_name}.result_projects, 
${stats_db_name}.result, ${stats_db_name}.project WHERE result_projects.id=result.id AND result.type='publication' AND project.id=result_projects.project; +UPDATE ${stats_db_name}.otherresearchproduct_tmp +SET delayed = 'yes' +WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id + FROM ${stats_db_name}.result r, + ${stats_db_name}.project_results pr, + ${stats_db_name}.project_tmp p + WHERE r.id = pr.result + AND pr.id = p.id + AND to_date(r.date) - to_date(p.enddate) > 0); + +CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS +SELECT result_projects.id AS result, + result_projects.project AS project_results, + result.date as resultdate, + project.enddate as projectenddate, + result_projects.daysfromend AS daysfromend +FROM ${stats_db_name}.result_projects, + ${stats_db_name}.result, + ${stats_db_name}.project +WHERE result_projects.id = result.id + AND result.type = 'publication' + AND project.id = result_projects.project; + +-- ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.project COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql index 25439852e..51d3a73c9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step12.sql @@ -1,21 +1,25 @@ ------------------------------------------------------------------------------------------------------ -- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables ------------------------------------------------------------------------------------------------------ +CREATE TABLE ${stats_db_name}.datasource 
stored AS parquet AS +SELECT * +FROM ${stats_db_name}.datasource_tmp; -DROP TABLE IF EXISTS ${stats_db_name}.datasource; -CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS SELECT * FROM ${stats_db_name}.datasource_tmp; +CREATE TABLE ${stats_db_name}.publication stored AS parquet AS +SELECT * +FROM ${stats_db_name}.publication_tmp; -DROP TABLE IF EXISTS ${stats_db_name}.publication; -CREATE TABLE ${stats_db_name}.publication stored AS parquet AS SELECT * FROM ${stats_db_name}.publication_tmp; +CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS +SELECT * +FROM ${stats_db_name}.dataset_tmp; -DROP TABLE IF EXISTS ${stats_db_name}.dataset; -CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS SELECT * FROM ${stats_db_name}.dataset_tmp; +CREATE TABLE ${stats_db_name}.software stored AS parquet AS +SELECT * +FROM ${stats_db_name}.software_tmp; -DROP TABLE IF EXISTS ${stats_db_name}.software; -CREATE TABLE ${stats_db_name}.software stored AS parquet AS SELECT * FROM ${stats_db_name}.software_tmp; - -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct; -CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS SELECT * FROM ${stats_db_name}.otherresearchproduct_tmp; +CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS +SELECT * +FROM ${stats_db_name}.otherresearchproduct_tmp; DROP TABLE ${stats_db_name}.project_tmp; DROP TABLE ${stats_db_name}.datasource_tmp; @@ -29,10 +33,47 @@ DROP TABLE ${stats_db_name}.otherresearchproduct_tmp; --------------------------------------------- -- Result -CREATE OR REPLACE VIEW ${stats_db_name}.result AS SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct; +CREATE OR REPLACE VIEW ${stats_db_name}.result AS +SELECT *, 
bestlicence AS access_mode +FROM ${stats_db_name}.publication +UNION ALL +SELECT *, bestlicence as access_mode +FROM ${stats_db_name}.software +UNION ALL +SELECT *, bestlicence AS access_mode +FROM ${stats_db_name}.dataset +UNION ALL +SELECT *, bestlicence AS access_mode +FROM ${stats_db_name}.otherresearchproduct; ------------------------------------------------------------------------------- -- To see with Antonis if the following is needed and where it should be placed ------------------------------------------------------------------------------- -CREATE TABLE ${stats_db_name}.numbers_country AS SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications FROM ${stats_db_name}.result r, ${stats_db_name}.result_datasources rd, ${stats_db_name}.datasource d, ${stats_db_name}.datasource_organizations dor, ${stats_db_name}.organization org WHERE r.id=rd.id AND rd.datasource=d.id AND d.id=dor.id AND dor.organization=org.id AND r.type='publication' AND r.bestlicence='Open Access' GROUP BY org.country; +CREATE TABLE ${stats_db_name}.numbers_country AS +SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications +FROM ${stats_db_name}.result r, + ${stats_db_name}.result_datasources rd, + ${stats_db_name}.datasource d, + ${stats_db_name}.datasource_organizations dor, + ${stats_db_name}.organization org +WHERE r.id = rd.id + AND rd.datasource = d.id + AND d.id = dor.id + AND dor.organization = org.id + AND r.type = 'publication' + AND r.bestlicence = 'Open Access' +GROUP BY org.country; + +-- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS; +-- ANALYZE TABLE 
${stats_db_name}.dataset COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index 795770313..d79396b3b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -57,3 +57,12 @@ UNION ALL SELECT * FROM ${stats_db_name}.software_sources UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; +-- +-- ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql index 4a56b5d68..00a6913bc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql @@ -47,3 +47,16 @@ FROM ( SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id; + +-- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_licenses COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_licenses COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_licenses COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_licenses COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.organization_pids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.organization_sources COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 60b37048b..8f364d747 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -34,3 +34,12 @@ union all select * from ${stats_db_name}.software_refereed union all select * from ${stats_db_name}.otherresearchproduct_refereed; +-- +-- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql index 33849b960..833deff73 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16.sql @@ -77,4 +77,15 @@ join ${stats_db_name}.result_datasources rd on rd.id=r.id join ${stats_db_name}.datasource d on d.id=rd.datasource join ${stats_db_name}.datasource_organizations dor on dor.id=d.id join ${stats_db_name}.organization o on o.id=dor.organization -where o.country is not null and o.country!=''; \ No newline at end of file +where o.country is not null and o.country!=''; + +-- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS FOR COLUMNS; 
+-- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql index f737c1ea6..2bdc263ef 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql @@ -52,4 +52,7 @@ LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; drop table if exists ${stats_db_name}.result; drop view if exists ${stats_db_name}.result; create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; -drop table ${stats_db_name}.result_tmp; \ No newline at end of file +drop table ${stats_db_name}.result_tmp; +-- +-- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql index ced7bbc11..528aaff52 
100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_6.sql @@ -29,4 +29,11 @@ select rcount.pid, sum(case when rcount.type='publication' then rcount.count els from rcount group by rcount.pid; -create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture \ No newline at end of file +create view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; +-- +-- ANALYZE TABLE ${stats_db_name}.result_projectcount COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result_projectcount COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.result_fundercount COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result_fundercount COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.project_resultcount COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.project_resultcount COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step17.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step17.sql deleted file mode 100644 index 5c102d014..000000000 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step17.sql +++ /dev/null @@ -1,207 +0,0 @@ ------------------------------------------------------- ------------------------------------------------------- --- Shadow schema table exchange ------------------------------------------------------- ------------------------------------------------------- - --- Dropping old views -DROP VIEW IF EXISTS ${stats_db_shadow_name}.category; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.concept; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.context; -DROP VIEW IF EXISTS 
${stats_db_shadow_name}.country; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.countrygdp; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.creation_date; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_citations; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_classifications; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_concepts; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_datasources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_languages; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_licenses; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_oids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_pids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_refereed; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_sources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_topics; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_languages; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_oids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_organizations; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_results; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_sources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.funder; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.fundref; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.numbers_country; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_datasources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_pids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_projects; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_sources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations; -DROP VIEW IF EXISTS 
${stats_db_shadow_name}.otherresearchproduct_classifications; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_concepts; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_datasources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_refereed; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.project; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_oids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_organizations; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_results; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_resultcount; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_results_publication; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_citations; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_classifications; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_concepts; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_datasources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_languages; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_licenses; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_oids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_pids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_refereed; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_sources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_topics; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result; -DROP VIEW IF EXISTS 
${stats_db_shadow_name}.result_affiliated_country; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_citations; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_classifications; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_concepts; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_datasources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_deposited_country; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_fundercount; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_gold; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_greenoa; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_languages; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_licenses; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_oids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_organization; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_peerreviewed; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_pids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_projectcount; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_projects; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_refereed; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_sources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_topics; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.rndexpediture; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.roarmap; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_citations; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_classifications; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_concepts; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_datasources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_languages; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_licenses; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_oids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_pids; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_refereed; 
-DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_sources; -DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_topics; - - --- Creating the shadow database, in case it doesn't exist -CREATE database IF NOT EXISTS ${stats_db_shadow_name}; - --- Creating new views -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.category AS SELECT * FROM ${stats_db_name}.category; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.concept AS SELECT * FROM ${stats_db_name}.concept; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.context AS SELECT * FROM ${stats_db_name}.context; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.country AS SELECT * FROM ${stats_db_name}.country; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids; -CREATE VIEW IF NOT EXISTS 
${stats_db_shadow_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.funder AS SELECT * FROM ${stats_db_name}.funder; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization AS SELECT * FROM ${stats_db_name}.organization; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_sources AS SELECT * FROM 
${stats_db_name}.organization_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project AS SELECT * FROM ${stats_db_name}.project; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids; -CREATE 
VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication AS SELECT * FROM ${stats_db_name}.publication; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_topics AS SELECT * FROM 
${stats_db_name}.publication_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result AS SELECT * FROM ${stats_db_name}.result; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids; -CREATE VIEW IF NOT EXISTS 
${stats_db_shadow_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software AS SELECT * FROM ${stats_db_name}.software; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed; -CREATE 
VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step18.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step18.sql deleted file mode 100644 index 34e48a18a..000000000 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step18.sql +++ /dev/null @@ -1,8 +0,0 @@ ------------------------------------------------------- ------------------------------------------------------- --- Impala table statistics - Needed to make the tables --- visible for impala ------------------------------------------------------- ------------------------------------------------------- - -INVALIDATE METADATA ${stats_db_name}; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step19.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step19.sql deleted file mode 100644 index 34e48a18a..000000000 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step19.sql +++ /dev/null @@ -1,8 +0,0 @@ ------------------------------------------------------- ------------------------------------------------------- --- Impala table statistics - Needed to make the tables --- visible for impala ------------------------------------------------------- ------------------------------------------------------- - -INVALIDATE METADATA ${stats_db_name}; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index ba0db25be..62a158560 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -5,40 +5,109 @@ -------------------------------------------------------------- -- Publication temporary table -DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp; +CREATE TABLE ${stats_db_name}.publication_tmp +( + id STRING, + title STRING, + publisher STRING, + journal STRING, + date STRING, + year STRING, + bestlicence STRING, + embargo_end_date STRING, + delayed BOOLEAN, + authors INT, + source STRING, + abstract BOOLEAN, + type STRING +) + clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true'); -CREATE TABLE ${stats_db_name}.publication_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true'); - -INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal , -p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence, -p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors , concat_ws('\u003B',p.source.value) as source, -case when size(p.description) > 0 then true else false end as abstract, -'publication' as type +INSERT INTO ${stats_db_name}.publication_tmp +SELECT substr(p.id, 4) as id, + p.title[0].value as title, + p.publisher.value as publisher, + p.journal.name as journal, + p.dateofacceptance.value as date, + 
date_format(p.dateofacceptance.value, 'yyyy') as year, + p.bestaccessright.classname as bestlicence, + p.embargoenddate.value as embargo_end_date, + false as delayed, + size(p.author) as authors, + concat_ws('\u003B', p.source.value) as source, + case when size(p.description) > 0 then true else false end as abstract, + 'publication' as type from ${openaire_db_name}.publication p -where p.datainfo.deletedbyinference=false; +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.publication_classifications AS +SELECT substr(p.id, 4) as id, instancetype.classname as type +from ${openaire_db_name}.publication p + LATERAL VIEW explode(p.instance.instancetype) instances as instancetype +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.publication_concepts AS +SELECT substr(p.id, 4) as id, contexts.context.id as concept +from ${openaire_db_name}.publication p + LATERAL VIEW explode(p.context) contexts as context +where p.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.publication_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource - FROM ( - SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource - from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance - where p.datainfo.deletedbyinference=false ) p - LEFT OUTER JOIN ( - SELECT substr(d.id, 4) id - from 
${openaire_db_name}.datasource d - WHERE d.datainfo.deletedbyinference=false ) d on p.datasource = d.id; +FROM ( + SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource + from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance + where p.datainfo.deletedbyinference = false) p + LEFT OUTER JOIN ( + SELECT substr(d.id, 4) id + from ${openaire_db_name}.datasource d + WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id; -CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.publication_languages AS +select substr(p.id, 4) as id, p.language.classname as language +FROM ${openaire_db_name}.publication p +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.publication_oids AS +SELECT substr(p.id, 4) AS id, oids.ids AS oid +FROM ${openaire_db_name}.publication p + LATERAL VIEW explode(p.originalid) oids AS ids +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.publication_pids AS +SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid +FROM ${openaire_db_name}.publication p + LATERAL VIEW explode(p.pid) pids AS ppid +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.publication_topics as select substr(p.id, 4) AS id, 
subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.publication_topics as +select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic +FROM ${openaire_db_name}.publication p + LATERAL VIEW explode(p.subject) subjects AS subject +where p.datainfo.deletedbyinference = false; --- Publication_citations -CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and p.datainfo.deletedbyinference=false; \ No newline at end of file +CREATE TABLE ${stats_db_name}.publication_citations AS +SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result +FROM ${openaire_db_name}.publication p + lateral view explode(p.extrainfo) citations AS citation +WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" + and p.datainfo.deletedbyinference = false; + +-- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE 
STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql new file mode 100644 index 000000000..9477ada12 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -0,0 +1,121 @@ +drop database if exists TARGET cascade; +create database if not exists TARGET; + +create view if not exists TARGET.category as select * from SOURCE.category; +create view if not exists TARGET.concept as select * from SOURCE.concept; +create view if not exists TARGET.context as select * from SOURCE.context; +create view if not exists TARGET.country as select * from SOURCE.country; +create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; +create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; +create view if not exists TARGET.funder as select * from SOURCE.funder; +create view if not exists TARGET.fundref 
as select * from SOURCE.fundref; +create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +--create view if not exists TARGET.roarmap as select * from SOURCE.roarmap; + +create table TARGET.result as + select distinct * from ( + select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id) + union all + select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) ) foo; +compute stats TARGET.result; + +create table TARGET.result_affiliated_country as select * from SOURCE.result_affiliated_country rac where exists (select 1 from TARGET.result r where r.id=rac.id); +compute stats TARGET.result_affiliated_country; + +create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_citations; + +create table TARGET.result_classifications as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_classifications; + +create table TARGET.result_concepts as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_concepts; + +create table TARGET.result_datasources as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_datasources; + +create table TARGET.result_deposited_country as select * from SOURCE.result_deposited_country orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_deposited_country; + +create table TARGET.result_fundercount as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_fundercount; + +create table 
TARGET.result_gold as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_gold; + +create table TARGET.result_greenoa as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_greenoa; + +create table TARGET.result_languages as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_languages; + +create table TARGET.result_licences as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_licences; + +create table TARGET.result_oids as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_oids; + +create table TARGET.result_organization as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_organization; + +create table TARGET.result_peerreviewed as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_peerreviewed; + +create table TARGET.result_pids as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_pids; + +create table TARGET.result_projectcount as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_projectcount; + +create table TARGET.result_projects as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_projects; + +create table TARGET.result_refereed as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where 
r.id=orig.id); +compute stats TARGET.result_refereed; + +create table TARGET.result_sources as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_sources; + +create table TARGET.result_topics as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_topics; + +-- datasources +create view if not exists TARGET.datasource as select * from SOURCE.datasource; +create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids; +create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations; +create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; + +create table TARGET.datasource_results as select id as result, datasource as id from TARGET.result_datasources; +compute stats TARGET.datasource_results; + +-- organizations +create view if not exists TARGET.organization as select * from SOURCE.organization; +create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources; +create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids; +create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects; +create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources; + +-- projects +create view if not exists TARGET.project as select * from SOURCE.project; +create view if not exists TARGET.project_oids as select * from SOURCE.project_oids; +create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; +create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; + +create table TARGET.project_results as select id as result, project as id from TARGET.result_projects; +compute stats 
TARGET.project_results; + +--denorm +alter table TARGET.result rename to TARGET.res_tmp; + +create table TARGET.result_denorm as + select distinct r.*, rp.project, p.acronym as pacronym, p.title as ptitle, p.funder as pfunder, p.funding_lvl0 as pfunding_lvl0, rd.datasource, d.name as dname, d.type as dtype + from TARGET.res_tmp r + join TARGET.result_projects rp on rp.id=r.id + join TARGET.result_datasources rd on rd.id=r.id + join TARGET.project p on p.id=rp.project + join TARGET.datasource d on d.id=rd.datasource; +compute stats TARGET.result_denorm; + +alter table TARGET.result_denorm rename to TARGET.result; +drop table TARGET.res_tmp; +--- done! \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index f69715a31..dcd5ad858 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -5,32 +5,108 @@ ------------------------------------------------------ -- Dataset temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp; -CREATE TABLE ${stats_db_name}.dataset_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored AS orc tblproperties('transactional'='true'); +CREATE TABLE ${stats_db_name}.dataset_tmp +( + id STRING, + title STRING, + publisher STRING, + journal STRING, + date STRING, + year STRING, + bestlicence STRING, + embargo_end_date STRING, + delayed BOOLEAN, + authors INT, + source STRING, + abstract BOOLEAN, + type STRING +) + clustered by (id) into 100 
buckets stored AS orc tblproperties ('transactional' = 'true'); -INSERT INTO ${stats_db_name}.dataset_tmp SELECT substr(d.id, 4) AS id, d.title[0].value AS title, d.publisher.value AS publisher, cast(null AS string) AS journal, -d.dateofacceptance.value as date, date_format(d.dateofacceptance.value,'yyyy') AS year, d.bestaccessright.classname AS bestlicence, -d.embargoenddate.value AS embargo_end_date, false AS delayed, size(d.author) AS authors , concat_ws('\u003B',d.source.value) AS source, - CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract, -'dataset' AS type +INSERT INTO ${stats_db_name}.dataset_tmp +SELECT substr(d.id, 4) AS id, + d.title[0].value AS title, + d.publisher.value AS publisher, + cast(null AS string) AS journal, + d.dateofacceptance.value as date, + date_format(d.dateofacceptance.value, 'yyyy') AS year, + d.bestaccessright.classname AS bestlicence, + d.embargoenddate.value AS embargo_end_date, + false AS delayed, + size(d.author) AS authors, + concat_ws('\u003B', d.source.value) AS source, + CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract, + 'dataset' AS type FROM ${openaire_db_name}.dataset d -WHERE d.datainfo.deletedbyinference=FALSE; +WHERE d.datainfo.deletedbyinference = FALSE; --- Dataset_citations -CREATE TABLE ${stats_db_name}.dataset_citations AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and d.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.dataset_citations AS +SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result +FROM ${openaire_db_name}.dataset d + LATERAL VIEW explode(d.extrainfo) citations AS citation +WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") 
!= "" + and d.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.dataset_classifications AS +SELECT substr(p.id, 4) AS id, instancetype.classname AS type +FROM ${openaire_db_name}.dataset p + LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.dataset_concepts AS +SELECT substr(p.id, 4) as id, contexts.context.id as concept +from ${openaire_db_name}.dataset p + LATERAL VIEW explode(p.context) contexts as context +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_datasources AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource -FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance) instances AS instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN -(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id; +CREATE TABLE ${stats_db_name}.dataset_datasources AS +SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource +FROM ( + SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource + FROM ${openaire_db_name}.dataset p + LATERAL VIEW explode(p.instance) instances AS instance + where p.datainfo.deletedbyinference = false) p + LEFT OUTER 
JOIN ( + SELECT substr(d.id, 4) id + FROM ${openaire_db_name}.datasource d + WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id; -CREATE TABLE ${stats_db_name}.dataset_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.dataset_languages AS +SELECT substr(p.id, 4) AS id, p.language.classname AS language +FROM ${openaire_db_name}.dataset p +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.dataset_oids AS +SELECT substr(p.id, 4) AS id, oids.ids AS oid +FROM ${openaire_db_name}.dataset p + LATERAL VIEW explode(p.originalid) oids AS ids +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.dataset_pids AS +SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +FROM ${openaire_db_name}.dataset p + LATERAL VIEW explode(p.pid) pids AS ppid +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.dataset_topics AS +SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +FROM ${openaire_db_name}.dataset p + LATERAL VIEW explode(p.subject) 
subjects AS subject +where p.datainfo.deletedbyinference = false; +-- +-- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 2c4a625e1..fd5390e66 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -5,32 +5,108 @@ -------------------------------------------------------- -- Software temporary table supporting updates -DROP TABLE IF EXISTS 
${stats_db_name}.software_tmp; -CREATE TABLE ${stats_db_name}.software_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); +CREATE TABLE ${stats_db_name}.software_tmp +( + id STRING, + title STRING, + publisher STRING, + journal STRING, + date STRING, + year STRING, + bestlicence STRING, + embargo_end_date STRING, + delayed BOOLEAN, + authors INT, + source STRING, + abstract BOOLEAN, + type STRING +) + clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -INSERT INTO ${stats_db_name}.software_tmp SELECT substr(s.id, 4) as id, s.title[0].value AS title, s.publisher.value AS publisher, CAST(NULL AS string) AS journal, -s.dateofacceptance.value AS DATE, date_format(s.dateofacceptance.value,'yyyy') AS YEAR, s.bestaccessright.classname AS bestlicence, -s.embargoenddate.value AS embargo_end_date, FALSE AS delayed, SIZE(s.author) AS authors , concat_ws('\u003B',s.source.value) AS source, - CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract, -'software' as type +INSERT INTO ${stats_db_name}.software_tmp +SELECT substr(s.id, 4) as id, + s.title[0].value AS title, + s.publisher.value AS publisher, + CAST(NULL AS string) AS journal, + s.dateofacceptance.value AS DATE, + date_format(s.dateofacceptance.value, 'yyyy') AS YEAR, + s.bestaccessright.classname AS bestlicence, + s.embargoenddate.value AS embargo_end_date, + FALSE AS delayed, + SIZE(s.author) AS authors, + concat_ws('\u003B', s.source.value) AS source, + CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract, + 'software' as type from ${openaire_db_name}.software s -where s.datainfo.deletedbyinference=false; +where s.datainfo.deletedbyinference = false; --- Software_citations -CREATE TABLE 
${stats_db_name}.software_citations AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and s.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.software_citations AS +SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT +FROM ${openaire_db_name}.software s + LATERAL VIEW explode(s.extrainfo) citations as citation +where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" + and s.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.software_classifications AS +SELECT substr(p.id, 4) AS id, instancetype.classname AS type +FROM ${openaire_db_name}.software p + LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.software_concepts AS +SELECT substr(p.id, 4) AS id, contexts.context.id AS concept +FROM ${openaire_db_name}.software p + LATERAL VIEW explode(p.context) contexts AS context +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM (SELECT substr(p.id, 4) AS id, 
substr(instances.instance.hostedby.key, 4) AS datasource -FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance) instances AS instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN -(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id; +CREATE TABLE ${stats_db_name}.software_datasources AS +SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource +FROM ( + SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource + FROM ${openaire_db_name}.software p + LATERAL VIEW explode(p.instance) instances AS instance + where p.datainfo.deletedbyinference = false) p + LEFT OUTER JOIN ( + SELECT substr(d.id, 4) id + FROM ${openaire_db_name}.datasource d + WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id; -CREATE TABLE ${stats_db_name}.software_languages AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.software_languages AS +select substr(p.id, 4) AS id, p.language.classname AS language +FROM ${openaire_db_name}.software p +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.software_oids AS +SELECT substr(p.id, 4) AS id, oids.ids AS oid +FROM ${openaire_db_name}.software p + LATERAL VIEW explode(p.originalid) oids AS ids +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; +CREATE TABLE 
${stats_db_name}.software_pids AS +SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +FROM ${openaire_db_name}.software p + LATERAL VIEW explode(p.pid) pids AS ppid +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.software_topics AS +SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +FROM ${openaire_db_name}.software p + LATERAL VIEW explode(p.subject) subjects AS subject +where p.datainfo.deletedbyinference = false; +-- +-- ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE 
STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index 1fa5df8cb..b359b596f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -5,32 +5,99 @@ -------------------------------------------------------------------------------- -- Otherresearchproduct temporary table supporting updates -DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp; -CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); +CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp +( + id STRING, + title STRING, + publisher STRING, + journal STRING, + date STRING, + year STRING, + bestlicence STRING, + embargo_end_date STRING, + delayed BOOLEAN, + authors INT, + source STRING, + abstract BOOLEAN, + type STRING +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -INSERT INTO ${stats_db_name}.otherresearchproduct_tmp SELECT substr(o.id, 4) AS id, o.title[0].value AS title, o.publisher.value AS publisher, CAST(NULL AS string) AS journal, -o.dateofacceptance.value AS DATE, date_format(o.dateofacceptance.value,'yyyy') AS year, o.bestaccessright.classname AS bestlicence, -o.embargoenddate.value as embargo_end_date, FALSE AS delayed, SIZE(o.author) AS authors , 
concat_ws('\u003B',o.source.value) AS source, -CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract, -'other' AS type +INSERT INTO ${stats_db_name}.otherresearchproduct_tmp +SELECT substr(o.id, 4) AS id, + o.title[0].value AS title, + o.publisher.value AS publisher, + CAST(NULL AS string) AS journal, + o.dateofacceptance.value AS DATE, + date_format(o.dateofacceptance.value, 'yyyy') AS year, + o.bestaccessright.classname AS bestlicence, + o.embargoenddate.value as embargo_end_date, + FALSE AS delayed, + SIZE(o.author) AS authors, + concat_ws('\u003B', o.source.value) AS source, + CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract, + 'other' AS type FROM ${openaire_db_name}.otherresearchproduct o -WHERE o.datainfo.deletedbyinference=FALSE; +WHERE o.datainfo.deletedbyinference = FALSE; -- Otherresearchproduct_citations -CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="" and o.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS +SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT +FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation +WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" + and o.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype where p.datainfo.deletedbyinference=false; +CREATE TABLE 
${stats_db_name}.otherresearchproduct_classifications AS +SELECT substr(p.id, 4) AS id, instancetype.classname AS type +FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS +SELECT substr(p.id, 4) AS id, contexts.context.id AS concept +FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource -from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance where p.datainfo.deletedbyinference=false) p LEFT OUTER JOIN -(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id; +CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS +SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource +FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource + from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance + where p.datainfo.deletedbyinference = false) p + LEFT OUTER JOIN(SELECT substr(d.id, 4) id + from ${openaire_db_name}.datasource d + WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id; -CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS 
SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS +SELECT substr(p.id, 4) AS id, p.language.classname AS language +FROM ${openaire_db_name}.otherresearchproduct p +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS +SELECT substr(p.id, 4) AS id, oids.ids AS oid +FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS +SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid +FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid +where p.datainfo.deletedbyinference = false; -CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject where p.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS +SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic +FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects 
AS subject +where p.datainfo.deletedbyinference = false; + +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index b4745535d..23ef03bc9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -1,30 +1,84 @@ --- noinspection SqlNoDataSourceInspectionForFile - ------------------------------------------------------ ------------------------------------------------------ -- Project table/view and Project related tables/views ------------------------------------------------------ ------------------------------------------------------ --- Project_oids Table -DROP TABLE IF EXISTS ${stats_db_name}.project_oids; -CREATE TABLE ${stats_db_name}.project_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids; +CREATE TABLE ${stats_db_name}.project_oids AS +SELECT substr(p.id, 4) AS id, oids.ids AS oid +FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids; +CREATE TABLE ${stats_db_name}.project_organizations AS +SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization +from ${openaire_db_name}.relation r +WHERE r.reltype = 'projectOrganization' + and r.datainfo.deletedbyinference = false; --- Project_organizations Table -DROP TABLE IF EXISTS ${stats_db_name}.project_organizations; -CREATE TABLE ${stats_db_name}.project_organizations AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype='projectOrganization' and r.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.project_results AS +SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result +FROM ${openaire_db_name}.relation r +WHERE r.reltype = 'resultProject' + and r.datainfo.deletedbyinference = false; --- Project_results Table -DROP TABLE IF EXISTS ${stats_db_name}.project_results; -CREATE TABLE ${stats_db_name}.project_results AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result FROM ${openaire_db_name}.relation r WHERE r.reltype='resultProject' and 
r.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.project_tmp +( + id STRING, + acronym STRING, + title STRING, + funder STRING, + funding_lvl0 STRING, + funding_lvl1 STRING, + funding_lvl2 STRING, + ec39 STRING, + type STRING, + startdate STRING, + enddate STRING, + start_year INT, + end_year INT, + duration INT, + haspubs STRING, + numpubs INT, + daysforlastpub INT, + delayedpubs INT, + callidentifier STRING, + code STRING +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); --- Project table ----------------- --- Creating and populating temporary Project table -DROP TABLE IF EXISTS ${stats_db_name}.project_tmp; -CREATE TABLE ${stats_db_name}.project_tmp (id STRING, acronym STRING, title STRING, funder STRING, funding_lvl0 STRING, funding_lvl1 STRING, funding_lvl2 STRING, ec39 STRING, type STRING, startdate STRING, enddate STRING, start_year INT, end_year INT, duration INT, haspubs STRING, numpubs INT, daysforlastpub INT, delayedpubs INT, callidentifier STRING, code STRING) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); - -INSERT INTO ${stats_db_name}.project_tmp SELECT substr(p.id, 4) AS id, p.acronym.value AS acronym, p.title.value AS title, xpath_string(p.fundingtree[0].value, '//funder/name') AS funder, xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0, xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1, xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2, p.ecsc39.value AS ec39, p.contracttype.classname AS type, p.startdate.value AS startdate, p.enddate.value AS enddate, year(p.startdate.value) AS start_year, year(p.enddate.value) AS end_year, CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration, 'no' AS haspubs, 0 AS numpubs, 0 AS daysforlastpub, 0 AS delayedpubs, p.callidentifier.value AS callidentifier, p.code.value AS code FROM 
${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference=false; +INSERT INTO ${stats_db_name}.project_tmp +SELECT substr(p.id, 4) AS id, + p.acronym.value AS acronym, + p.title.value AS title, + xpath_string(p.fundingtree[0].value, '//funder/name') AS funder, + xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0, + xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1, + xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2, + p.ecsc39.value AS ec39, + p.contracttype.classname AS type, + p.startdate.value AS startdate, + p.enddate.value AS enddate, + year(p.startdate.value) AS start_year, + year(p.enddate.value) AS end_year, + CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration, + 'no' AS haspubs, + 0 AS numpubs, + 0 AS daysforlastpub, + 0 AS delayedpubs, + p.callidentifier.value AS callidentifier, + p.code.value AS code +FROM ${openaire_db_name}.project p +WHERE p.datainfo.deletedbyinference = false; create table ${stats_db_name}.funder as -select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname -from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund +select distinct xpath_string(fund, '//funder/id') as id, + xpath_string(fund, '//funder/name') as name, + xpath_string(fund, '//funder/shortname') as shortname +from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; + +-- ANALYZE TABLE ${stats_db_name}.project_oids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.project_oids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.project_organizations COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.project_organizations COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.project_results COMPUTE STATISTICS; +-- ANALYZE TABLE 
${stats_db_name}.project_results COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.project_tmp COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.project_tmp COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.funder COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.funder COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 36a4a8a49..ae540b9b2 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -5,27 +5,137 @@ ---------------------------------------------------- -- Views on temporary tables that should be re-created in the end -CREATE OR REPLACE VIEW ${stats_db_name}.result as SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.publication_tmp UNION ALL SELECT *,bestlicence AS access_mode FROM ${stats_db_name}.software_tmp UNION ALL SELECT *,bestlicence AS access_mode FROM ${stats_db_name}.dataset_tmp UNION ALL SELECT *,bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct_tmp; +CREATE OR REPLACE VIEW ${stats_db_name}.result as +SELECT *, bestlicence AS access_mode +FROM ${stats_db_name}.publication_tmp +UNION ALL +SELECT *, bestlicence AS access_mode +FROM ${stats_db_name}.software_tmp +UNION ALL +SELECT *, bestlicence AS access_mode +FROM ${stats_db_name}.dataset_tmp +UNION ALL +SELECT *, bestlicence AS access_mode +FROM ${stats_db_name}.otherresearchproduct_tmp; -- Views on final tables -CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources UNION ALL SELECT * FROM ${stats_db_name}.software_datasources UNION ALL 
SELECT * FROM ${stats_db_name}.dataset_datasources UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources; +CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS +SELECT * +FROM ${stats_db_name}.publication_datasources +UNION ALL +SELECT * +FROM ${stats_db_name}.software_datasources +UNION ALL +SELECT * +FROM ${stats_db_name}.dataset_datasources +UNION ALL +SELECT * +FROM ${stats_db_name}.otherresearchproduct_datasources; -CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS SELECT * FROM ${stats_db_name}.publication_citations UNION ALL SELECT * FROM ${stats_db_name}.software_citations UNION ALL SELECT * FROM ${stats_db_name}.dataset_citations UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_citations; +CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS +SELECT * +FROM ${stats_db_name}.publication_citations +UNION ALL +SELECT * +FROM ${stats_db_name}.software_citations +UNION ALL +SELECT * +FROM ${stats_db_name}.dataset_citations +UNION ALL +SELECT * +FROM ${stats_db_name}.otherresearchproduct_citations; -CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications UNION ALL SELECT * FROM ${stats_db_name}.software_classifications UNION ALL SELECT * FROM ${stats_db_name}.dataset_classifications UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications; +CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS +SELECT * +FROM ${stats_db_name}.publication_classifications +UNION ALL +SELECT * +FROM ${stats_db_name}.software_classifications +UNION ALL +SELECT * +FROM ${stats_db_name}.dataset_classifications +UNION ALL +SELECT * +FROM ${stats_db_name}.otherresearchproduct_classifications; -CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts UNION ALL SELECT * FROM ${stats_db_name}.software_concepts UNION ALL SELECT * FROM ${stats_db_name}.dataset_concepts UNION ALL 
SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts; +CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS +SELECT * +FROM ${stats_db_name}.publication_concepts +UNION ALL +SELECT * +FROM ${stats_db_name}.software_concepts +UNION ALL +SELECT * +FROM ${stats_db_name}.dataset_concepts +UNION ALL +SELECT * +FROM ${stats_db_name}.otherresearchproduct_concepts; -CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS SELECT * FROM ${stats_db_name}.publication_languages UNION ALL SELECT * FROM ${stats_db_name}.software_languages UNION ALL SELECT * FROM ${stats_db_name}.dataset_languages UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_languages; +CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS +SELECT * +FROM ${stats_db_name}.publication_languages +UNION ALL +SELECT * +FROM ${stats_db_name}.software_languages +UNION ALL +SELECT * +FROM ${stats_db_name}.dataset_languages +UNION ALL +SELECT * +FROM ${stats_db_name}.otherresearchproduct_languages; -CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS SELECT * FROM ${stats_db_name}.publication_oids UNION ALL SELECT * FROM ${stats_db_name}.software_oids UNION ALL SELECT * FROM ${stats_db_name}.dataset_oids UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_oids; +CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS +SELECT * +FROM ${stats_db_name}.publication_oids +UNION ALL +SELECT * +FROM ${stats_db_name}.software_oids +UNION ALL +SELECT * +FROM ${stats_db_name}.dataset_oids +UNION ALL +SELECT * +FROM ${stats_db_name}.otherresearchproduct_oids; -CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS SELECT * FROM ${stats_db_name}.publication_pids UNION ALL SELECT * FROM ${stats_db_name}.software_pids UNION ALL SELECT * FROM ${stats_db_name}.dataset_pids UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_pids; +CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS +SELECT * +FROM ${stats_db_name}.publication_pids +UNION ALL +SELECT * +FROM 
${stats_db_name}.software_pids +UNION ALL +SELECT * +FROM ${stats_db_name}.dataset_pids +UNION ALL +SELECT * +FROM ${stats_db_name}.otherresearchproduct_pids; -CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS SELECT * FROM ${stats_db_name}.publication_topics UNION ALL SELECT * FROM ${stats_db_name}.software_topics UNION ALL SELECT * FROM ${stats_db_name}.dataset_topics UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; +CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS +SELECT * +FROM ${stats_db_name}.publication_topics +UNION ALL +SELECT * +FROM ${stats_db_name}.software_topics +UNION ALL +SELECT * +FROM ${stats_db_name}.dataset_topics +UNION ALL +SELECT * +FROM ${stats_db_name}.otherresearchproduct_topics; -DROP TABLE IF EXISTS ${stats_db_name}.result_organization; -CREATE TABLE ${stats_db_name}.result_organization AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='resultOrganization' and r.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.result_organization AS +SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization +FROM ${openaire_db_name}.relation r +WHERE r.reltype = 'resultOrganization' + and r.datainfo.deletedbyinference = false; -DROP TABLE IF EXISTS ${stats_db_name}.result_projects; -CREATE TABLE ${stats_db_name}.result_projects AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id=pr.result JOIN ${stats_db_name}.project_tmp p ON p.id=pr.id; +CREATE TABLE ${stats_db_name}.result_projects AS +select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend +FROM ${stats_db_name}.result r + JOIN ${stats_db_name}.project_results pr ON r.id = pr.result + JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; + +-- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE 
STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 197047c8b..de0fedd7e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -5,54 +5,105 @@ -- Datasource table/view and Datasource related tables/views ------------------------------------------------------------ ------------------------------------------------------------ - --- Datasource table creation & update -------------------------------------- --- Creating and populating temporary datasource table -DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp; -CREATE TABLE ${stats_db_name}.datasource_tmp(`id` string, `name` STRING, `type` STRING, `dateofvalidation` STRING, `yearofvalidation` string, `harvested` BOOLEAN, `piwik_id` INT, `latitude` STRING, `longitude`STRING, `websiteurl` STRING, `compatibility` STRING) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true'); +CREATE TABLE ${stats_db_name}.datasource_tmp +( + `id` string, + `name` STRING, + `type` STRING, + `dateofvalidation` STRING, + `yearofvalidation` string, + `harvested` BOOLEAN, + `piwik_id` INT, + `latitude` STRING, + `longitude` STRING, + `websiteurl` STRING, + `compatibility` STRING +) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -- Insert statement that takes into account the piwik_id of the openAIRE graph -INSERT INTO 
${stats_db_name}.datasource_tmp -SELECT substr(d1.id, 4) AS id, officialname.value AS name, -datasourcetype.classname AS type, dateofvalidation.value AS dateofvalidation, date_format(d1.dateofvalidation.value,'yyyy') AS yearofvalidation, -FALSE AS harvested, -CASE WHEN d2.piwik_id IS NULL THEN 0 ELSE d2.piwik_id END AS piwik_id, -d1.latitude.value AS latitude, d1.longitude.value AS longitude, -d1.websiteurl.value AS websiteurl, d1.openairecompatibility.classid AS compatibility +INSERT INTO ${stats_db_name}.datasource_tmp +SELECT substr(d1.id, 4) AS id, + officialname.value AS name, + datasourcetype.classname AS type, + dateofvalidation.value AS dateofvalidation, + date_format(d1.dateofvalidation.value, 'yyyy') AS yearofvalidation, + FALSE AS harvested, + CASE WHEN d2.piwik_id IS NULL THEN 0 ELSE d2.piwik_id END AS piwik_id, + d1.latitude.value AS latitude, + d1.longitude.value AS longitude, + d1.websiteurl.value AS websiteurl, + d1.openairecompatibility.classid AS compatibility FROM ${openaire_db_name}.datasource d1 -LEFT OUTER JOIN -(SELECT id, split(originalidd, '\\:')[1] as piwik_id -FROM ${openaire_db_name}.datasource -LATERAL VIEW EXPLODE(originalid) temp AS originalidd -WHERE originalidd like "piwik:%") AS d2 -ON d1.id = d2.id -WHERE d1.datainfo.deletedbyinference=FALSE; + LEFT OUTER JOIN + (SELECT id, split(originalidd, '\\:')[1] as piwik_id + FROM ${openaire_db_name}.datasource + LATERAL VIEW EXPLODE(originalid) temp AS originalidd + WHERE originalidd like "piwik:%") AS d2 + ON d1.id = d2.id +WHERE d1.datainfo.deletedbyinference = FALSE; -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. 
-- Creating a temporary dual table that will be removed after the following insert -CREATE TABLE ${stats_db_name}.dual(dummy CHAR(1)); -INSERT INTO ${stats_db_name}.dual VALUES('X'); -INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`) -SELECT 'other', 'Other', 'Repository', NULL, NULL, false, 0, NULL, NULL, NULL, 'unknown' FROM ${stats_db_name}.dual WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name='Unknown Repository'); +CREATE TABLE ${stats_db_name}.dual +( + dummy CHAR(1) +); +INSERT INTO ${stats_db_name}.dual +VALUES ('X'); +INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, + `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`) +SELECT 'other', + 'Other', + 'Repository', + NULL, + NULL, + false, + 0, + NULL, + NULL, + NULL, + 'unknown' +FROM ${stats_db_name}.dual +WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); DROP TABLE ${stats_db_name}.dual; -UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name='Unknown Repository'; -UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation='-1'; +UPDATE ${stats_db_name}.datasource_tmp +SET name='Other' +WHERE name = 'Unknown Repository'; +UPDATE ${stats_db_name}.datasource_tmp +SET yearofvalidation=null +WHERE yearofvalidation = '-1'; -DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages; -CREATE TABLE ${stats_db_name}.datasource_languages AS SELECT substr(d.id, 4) AS id, langs.languages AS language FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages; +CREATE TABLE ${stats_db_name}.datasource_languages AS +SELECT substr(d.id, 4) AS id, langs.languages AS language +FROM ${openaire_db_name}.datasource d LATERAL VIEW 
explode(d.odlanguages.value) langs AS languages; -DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids; -CREATE TABLE ${stats_db_name}.datasource_oids AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids; +CREATE TABLE ${stats_db_name}.datasource_oids AS +SELECT substr(d.id, 4) AS id, oids.ids AS oid +FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids; -DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations; -CREATE TABLE ${stats_db_name}.datasource_organizations AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='datasourceOrganization' and r.datainfo.deletedbyinference=false; +CREATE TABLE ${stats_db_name}.datasource_organizations AS +SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization +FROM ${openaire_db_name}.relation r +WHERE r.reltype = 'datasourceOrganization' + and r.datainfo.deletedbyinference = false; -- datasource sources: -- where the datasource info have been collected from. 
-create table if not exists ${stats_db_name}.datasource_sources AS select substr(d.id,4) as id, substr(cf.key, 4) as datasource from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf where d.datainfo.deletedbyinference=false; - -CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result FROM ${stats_db_name}.result_datasources; +create table if not exists ${stats_db_name}.datasource_sources AS +select substr(d.id, 4) as id, substr(cf.key, 4) as datasource +from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf +where d.datainfo.deletedbyinference = false; +CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS +SELECT datasource AS id, id AS result +FROM ${stats_db_name}.result_datasources; +-- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS FOR COLUMNS; +-- ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql index a918e4de4..a1cb46185 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step9.sql @@ -3,10 +3,21 @@ -- 
Organization table/view and Organization related tables/views ---------------------------------------------------------------- ---------------------------------------------------------------- -DROP TABLE IF EXISTS ${stats_db_name}.organization; -CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization AS SELECT substr(o.id, 4) as id, o.legalname.value as name, o.legalshortname.value as legalshortname, o.country.classid as country -FROM ${openaire_db_name}.organization o WHERE o.datainfo.deletedbyinference=FALSE; +CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization AS +SELECT substr(o.id, 4) as id, + o.legalname.value as name, + o.legalshortname.value as legalshortname, + o.country.classid as country +FROM ${openaire_db_name}.organization o +WHERE o.datainfo.deletedbyinference = FALSE; -CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource FROM ${stats_db_name}.datasource_organizations; +CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS +SELECT organization AS id, id AS datasource +FROM ${stats_db_name}.datasource_organizations; -CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS SELECT id AS project, organization as id FROM ${stats_db_name}.project_organizations; +CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS +SELECT id AS project, organization as id +FROM ${stats_db_name}.project_organizations; + +-- ANALYZE TABLE ${stats_db_name}.organization COMPUTE STATISTICS; +-- ANALYZE TABLE ${stats_db_name}.organization COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh index 36e74a556..dc19f84b4 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash curl --request GET $1/cache/updateCache - +sleep 20h \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index dcd034166..9c16f149d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -17,6 +17,14 @@ stats_db_shadow_name the name of the shadow schema + + monitor_db_name + the target monitor db name + + + monitor_db_shadow_name + the name of the shadow monitor db + stats_tool_api_url The url of the API of the stats tool. Is used to trigger the cache update. @@ -252,31 +260,6 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} - - - - - - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - stats_db_shadow_name=${stats_db_shadow_name} - - - - - - - - ${jobTracker} - ${nameNode} - impala-shell.sh - ${stats_db_name} - step18.sql - ${wf:appPath()}/scripts/step18.sql - impala-shell.sh - @@ -285,17 +268,31 @@ ${jobTracker} ${nameNode} - impala-shell.sh + finalizedb.sh + ${stats_db_name} ${stats_db_shadow_name} - computeProductionStats.sql - ${wf:appPath()}/scripts/computeProductionStats.sql - impala-shell.sh + finalizedb.sh - + - + + + ${jobTracker} + ${nameNode} + monitor.sh + ${stats_db_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${wf:appPath()}/scripts/step20-createMonitorDB.sql + monitor.sh + + + + + + ${jobTracker} ${nameNode} diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml new file mode 100644 index 000000000..a78f92d41 --- /dev/null +++ 
b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -0,0 +1,91 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.2.4-SNAPSHOT + + 4.0.0 + dhp-usage-raw-data-update + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + spark-sql_2.11 + 2.4.5 + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.json + json + 20180130 + jar + + + org.apache.hive + hive-jdbc + ${cdh.hive.version} + + + org.apache.hadoop + hadoop-common + ${cdh.hadoop.version} + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + c3p0 + c3p0 + 0.9.1.2 + jar + + + dhp-usage-raw-data-update + diff --git a/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh b/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh new file mode 100755 index 000000000..4465dae21 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/runworkflow.sh @@ -0,0 +1 @@ +mvn clean package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/usagerawdata \ No newline at end of file diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java new file mode 100644 index 000000000..5b2e6804b --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ConnectDB.java @@ -0,0 +1,125 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.Properties; + +import org.apache.log4j.Logger; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +/** + * @author D. Pierrakos, S. Zoupanos + */ +import com.mchange.v2.c3p0.ComboPooledDataSource; + +public abstract class ConnectDB { + + public static Connection DB_HIVE_CONNECTION; + public static Connection DB_IMPALA_CONNECTION; + + private static String dbHiveUrl; + private static String dbImpalaUrl; + private static String usageStatsDBSchema; + private static String statsDBSchema; + private final static Logger log = Logger.getLogger(ConnectDB.class); + + static void init() throws ClassNotFoundException { + + dbHiveUrl = ExecuteWorkflow.dbHiveUrl; + dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl; + usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema; + statsDBSchema = ExecuteWorkflow.statsDBSchema; + + Class.forName("org.apache.hive.jdbc.HiveDriver"); + } + + public static Connection getHiveConnection() throws SQLException { + if (DB_HIVE_CONNECTION != null && !DB_HIVE_CONNECTION.isClosed()) { + return DB_HIVE_CONNECTION; + } else { + DB_HIVE_CONNECTION = connectHive(); + + return DB_HIVE_CONNECTION; + } + } + + public static Connection getImpalaConnection() throws SQLException { + if (DB_IMPALA_CONNECTION != null && !DB_IMPALA_CONNECTION.isClosed()) { + return DB_IMPALA_CONNECTION; + } else { + DB_IMPALA_CONNECTION = connectImpala(); + + return DB_IMPALA_CONNECTION; + } + } + + public static String getUsageStatsDBSchema() { + return ConnectDB.usageStatsDBSchema; + } + + public static String getStatsDBSchema() { + return ConnectDB.statsDBSchema; + } + + private static Connection connectHive() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbHiveUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database 
successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbHiveUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + return cpds.getConnection(); + + } + + private static Connection connectImpala() throws SQLException { + /* + * Connection connection = DriverManager.getConnection(dbImpalaUrl); Statement stmt = + * connection.createStatement(); log.debug("Opened database successfully"); return connection; + */ + ComboPooledDataSource cpds = new ComboPooledDataSource(); + cpds.setJdbcUrl(dbImpalaUrl); + cpds.setAcquireIncrement(1); + cpds.setMaxPoolSize(100); + cpds.setMinPoolSize(1); + cpds.setInitialPoolSize(1); + cpds.setMaxIdleTime(300); + cpds.setMaxConnectionAge(36000); + + cpds.setAcquireRetryAttempts(5); + cpds.setAcquireRetryDelay(2000); + cpds.setBreakAfterAcquireFailure(false); + + cpds.setCheckoutTimeout(0); + cpds.setPreferredTestQuery("SELECT 1"); + cpds.setIdleConnectionTestPeriod(60); + + return cpds.getConnection(); + + } + +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java new file mode 100644 index 000000000..e0e0d3687 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ExecuteWorkflow.java @@ -0,0 +1,211 @@ +/* + * To change this license header, choose License Headers in Project Properties. 
+ * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; + +import org.apache.commons.io.IOUtils; +import org.apache.log4j.BasicConfigurator; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class ExecuteWorkflow { + + static String matomoAuthToken; + static String matomoBaseURL; + static String repoLogPath; + static String portalLogPath; + static String portalMatomoID; + static String irusUKBaseURL; + static String irusUKReportPath; + static String sarcsReportPathArray; + static String sarcsReportPathNonArray; + static String lareferenciaLogPath; + static String lareferenciaBaseURL; + static String lareferenciaAuthToken; + static String dbHiveUrl; + static String dbImpalaUrl; + static String usageStatsDBSchema; + static String statsDBSchema; + static boolean recreateDbAndTables; + + static boolean piwikEmptyDirs; + static boolean downloadPiwikLogs; + static boolean processPiwikLogs; + + static Calendar startingLogPeriod; + static Calendar endingLogPeriod; + static int numberOfPiwikIdsToDownload; + static int numberOfSiteIdsToDownload; + + static boolean laReferenciaEmptyDirs; + static boolean downloadLaReferenciaLogs; + static boolean processLaReferenciaLogs; + + static boolean irusCreateTablesEmptyDirs; + static boolean irusDownloadReports; + static boolean irusProcessStats; + static int irusNumberOfOpendoarsToDownload; + + static boolean sarcCreateTablesEmptyDirs; + static boolean sarcDownloadReports; + static boolean sarcProcessStats; + static int sarcNumberOfIssnToDownload; + + static boolean finalizeStats; + + static int numberOfDownloadThreads; + + public static void main(String args[]) throws Exception { + + // Sending the logs to the console + BasicConfigurator.configure(); + + final 
ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + UsageStatsExporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json"))); + parser.parseArgument(args); + + // Setting up the initial parameters + matomoAuthToken = parser.get("matomoAuthToken"); + matomoBaseURL = parser.get("matomoBaseURL"); + repoLogPath = parser.get("repoLogPath"); + portalLogPath = parser.get("portalLogPath"); + portalMatomoID = parser.get("portalMatomoID"); + irusUKBaseURL = parser.get("irusUKBaseURL"); + irusUKReportPath = parser.get("irusUKReportPath"); + sarcsReportPathArray = parser.get("sarcsReportPathArray"); + sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray"); + lareferenciaLogPath = parser.get("lareferenciaLogPath"); + lareferenciaBaseURL = parser.get("lareferenciaBaseURL"); + lareferenciaAuthToken = parser.get("lareferenciaAuthToken"); + + dbHiveUrl = parser.get("dbHiveUrl"); + dbImpalaUrl = parser.get("dbImpalaUrl"); + usageStatsDBSchema = parser.get("usageStatsDBSchema"); + statsDBSchema = parser.get("statsDBSchema"); + + if (parser.get("recreateDbAndTables").toLowerCase().equals("true")) { + recreateDbAndTables = true; + } else { + recreateDbAndTables = false; + } + + if (parser.get("piwikEmptyDirs").toLowerCase().equals("true")) { + piwikEmptyDirs = true; + } else { + piwikEmptyDirs = false; + } + + if (parser.get("downloadPiwikLogs").toLowerCase().equals("true")) { + downloadPiwikLogs = true; + } else { + downloadPiwikLogs = false; + } + + if (parser.get("processPiwikLogs").toLowerCase().equals("true")) { + processPiwikLogs = true; + } else { + processPiwikLogs = false; + } + + String startingLogPeriodStr = parser.get("startingLogPeriod"); + Date startingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(startingLogPeriodStr); + startingLogPeriod = startingLogPeriodStr(startingLogPeriodDate); + +// String endingLogPeriodStr = parser.get("endingLogPeriod"); +// Date 
endingLogPeriodDate = new SimpleDateFormat("MM/yyyy").parse(endingLogPeriodStr); +// endingLogPeriod = startingLogPeriodStr(endingLogPeriodDate); + + numberOfPiwikIdsToDownload = Integer.parseInt(parser.get("numberOfPiwikIdsToDownload")); + numberOfSiteIdsToDownload = Integer.parseInt(parser.get("numberOfSiteIdsToDownload")); + + if (parser.get("laReferenciaEmptyDirs").toLowerCase().equals("true")) { + laReferenciaEmptyDirs = true; + } else { + laReferenciaEmptyDirs = false; + } + + if (parser.get("downloadLaReferenciaLogs").toLowerCase().equals("true")) { + downloadLaReferenciaLogs = true; + } else { + downloadLaReferenciaLogs = false; + } + + if (parser.get("processLaReferenciaLogs").toLowerCase().equals("true")) { + processLaReferenciaLogs = true; + } else { + processLaReferenciaLogs = false; + } + + if (parser.get("irusCreateTablesEmptyDirs").toLowerCase().equals("true")) { + irusCreateTablesEmptyDirs = true; + } else { + irusCreateTablesEmptyDirs = false; + } + + if (parser.get("irusDownloadReports").toLowerCase().equals("true")) { + irusDownloadReports = true; + } else { + irusDownloadReports = false; + } + + if (parser.get("irusProcessStats").toLowerCase().equals("true")) { + irusProcessStats = true; + } else { + irusProcessStats = false; + } + irusNumberOfOpendoarsToDownload = Integer.parseInt(parser.get("irusNumberOfOpendoarsToDownload")); + + if (parser.get("sarcCreateTablesEmptyDirs").toLowerCase().equals("true")) { + sarcCreateTablesEmptyDirs = true; + } else { + sarcCreateTablesEmptyDirs = false; + } + + if (parser.get("sarcDownloadReports").toLowerCase().equals("true")) { + sarcDownloadReports = true; + } else { + sarcDownloadReports = false; + } + + if (parser.get("sarcProcessStats").toLowerCase().equals("true")) { + sarcProcessStats = true; + } else { + sarcProcessStats = false; + } + sarcNumberOfIssnToDownload = Integer.parseInt(parser.get("sarcNumberOfIssnToDownload")); + + if (parser.get("finalizeStats").toLowerCase().equals("true")) { + 
finalizeStats = true; + } else { + finalizeStats = false; + } + + numberOfDownloadThreads = Integer.parseInt(parser.get("numberOfDownloadThreads")); + + UsageStatsExporter usagestatsExport = new UsageStatsExporter(); + usagestatsExport.export(); + // usagestatsExport.createdDBWithTablesOnly(); + } + + private static Calendar startingLogPeriodStr(Date date) { + + Calendar calendar = Calendar.getInstance(); + calendar.setTime(date); + return calendar; + + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java new file mode 100644 index 000000000..7ec5b0fca --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/IrusStats.java @@ -0,0 +1,358 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. 
Zoupanos + */ +public class IrusStats { + + private String irusUKURL; + + private static final Logger logger = LoggerFactory.getLogger(IrusStats.class); + + public IrusStats(String irusUKURL) throws Exception { + this.irusUKURL = irusUKURL; + // The following may not be needed - It will be created when JSON tables are created +// createTmpTables(); + } + + public void reCreateLogDirs() throws Exception { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.delete(new Path(ExecuteWorkflow.irusUKReportPath), true); + + logger.info("Creating irusUKReport directory: " + ExecuteWorkflow.irusUKReportPath); + dfs.mkdirs(new Path(ExecuteWorkflow.irusUKReportPath)); + } + + public void createTables() throws Exception { + try { + logger.info("Creating sushilog"); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog(source STRING, " + + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + + "repository, rid, date, metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableSushiLog); + logger.info("Created sushilog"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + public void processIrusStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping 
sushilogtmp_json table"); + String dropSushilogtmpJson = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sushilogtmp_json"; + stmt.executeUpdate(dropSushilogtmpJson); + logger.info("Dropped sushilogtmp_json table"); + + logger.info("Creating irus_sushilogtmp_json table"); + String createSushilogtmpJson = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " Type: STRING,\n" + + " Value: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` ARRAY<\n" + + " struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.irusUKReportPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(createSushilogtmpJson); + logger.info("Created irus_sushilogtmp_json table"); + + logger.info("Dropping irus_sushilogtmp table"); + String dropSushilogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(dropSushilogtmp); + logger.info("Dropped irus_sushilogtmp table"); + + logger.info("Creating irus_sushilogtmp table"); + String createSushilogtmp = "CREATE TABLE " + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(createSushilogtmp); + logger.info("Created irus_sushilogtmp table"); + + logger.info("Inserting to irus_sushilogtmp table"); + String insertSushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp " + + "SELECT 'IRUS-UK', CONCAT('opendoar____::', 
split(split(INPUT__FILE__NAME,'IrusIRReport_')[1],'_')[0]), " + + "`ItemIdent`.`Value`, `ItemPerf`.`Period`.`Begin`, " + + "`ItemPerf`.`Instance`.`MetricType`, `ItemPerf`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "LATERAL VIEW posexplode(ItemPerformance) ItemPerformanceTable AS seqp, ItemPerf " + + "WHERE `ItemIdent`.`Type`= 'OAI'"; + stmt.executeUpdate(insertSushilogtmp); + logger.info("Inserted to irus_sushilogtmp table"); + + logger.info("Inserting to sushilog table"); + String insertToShushilog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sushilog SELECT * FROM " + + ConnectDB.getUsageStatsDBSchema() + + ".irus_sushilogtmp"; + stmt.executeUpdate(insertToShushilog); + logger.info("Inserted to sushilog table"); + + ConnectDB.getHiveConnection().close(); + } + + public void getIrusRRReport(String irusUKReportPath) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusRRReport) Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + + logger.info("(getIrusRRReport) Ending period for log download: " + sdf.format(end.getTime())); + + String reportUrl = irusUKURL + "GetReport/?Report=RR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + sdf.format(start.getTime()) + "&EndDate=" + sdf.format(end.getTime()) + + "&RepositoryIdentifier=&ItemDataType=&NewJiscBand=&Granularity=Monthly&Callback="; + + logger.info("(getIrusRRReport) Getting report: " + reportUrl); + + String text = 
getJson(reportUrl, "", ""); + + List opendoarsToVisit = new ArrayList(); + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray != null) { + int i = 0; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + JSONArray itemIdentifier = (JSONArray) jsonObjectRow.get("ItemIdentifier"); + for (Object identifier : itemIdentifier) { + JSONObject opendoar = (JSONObject) identifier; + if (opendoar.get("Type").toString().equals("OpenDOAR")) { + i++; + opendoarsToVisit.add(opendoar.get("Value").toString()); + break; + } + } + // break; + } + + logger.info("(getIrusRRReport) Found the following opendoars for download: " + opendoarsToVisit); + + if (ExecuteWorkflow.irusNumberOfOpendoarsToDownload > 0 + && ExecuteWorkflow.irusNumberOfOpendoarsToDownload <= opendoarsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + opendoarsToVisit = opendoarsToVisit.subList(0, ExecuteWorkflow.irusNumberOfOpendoarsToDownload); + } + + logger.info("(getIrusRRReport) Downloading the followins opendoars: " + opendoarsToVisit); + + for (String opendoar : opendoarsToVisit) { + logger.info("Now working on openDoar: " + opendoar); + this.getIrusIRReport(opendoar, irusUKReportPath); + } + logger.info("(getIrusRRReport) Finished with report: " + reportUrl); + } else { + logger.info("IRUS Reports not found for day"); + } + + } + + private void getIrusIRReport(String opendoar, String irusUKReportPath) throws Exception { + + logger.info("(getIrusIRReport) Getting report(s) with opendoar: " + opendoar); + + 
ConnectDB.getHiveConnection().setAutoCommit(false); + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getIrusIRReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + + // Setting the ending period (last day of the month) + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("(getIrusIRReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, "opendoar____::" + opendoar); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + int batch_size = 0; + + if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " + opendoar); + } else { + start.add(Calendar.MONTH, 1); + while (start.before(end)) { + logger.info("Downloading for date: " + simpleDateFormat.format(start.getTime())); + String reportUrl = this.irusUKURL + "GetReport/?Report=IR1&Release=4&RequestorID=OpenAIRE&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()) + + "&RepositoryIdentifier=opendoar%3A" + opendoar + + 
"&ItemIdentifier=&ItemDataType=&hasDOI=&Granularity=Monthly&Callback="; + start.add(Calendar.MONTH, 1); + + logger.info("Downloading file: " + reportUrl); + String text = getJson(reportUrl, "", ""); + if (text == null) { + continue; + } + + FileSystem fs = FileSystem.get(new Configuration()); + String filePath = irusUKReportPath + "/" + "IrusIRReport_" + + opendoar + "_" + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePath); + FSDataOutputStream fin = fs.create(new Path(filePath), true); + + JSONParser parser = new JSONParser(); + JSONObject jsonObject = (JSONObject) parser.parse(text); + jsonObject = (JSONObject) jsonObject.get("ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Report"); + jsonObject = (JSONObject) jsonObject.get("Customer"); + JSONArray jsonArray = (JSONArray) jsonObject.get("ReportItems"); + if (jsonArray == null) { + continue; + } + String oai = ""; + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + fin.write(jsonObjectRow.toJSONString().getBytes()); + fin.writeChar('\n'); + } + + fin.close(); + } + + } + // ConnectDB.getHiveConnection().close(); + + logger.info("(getIrusIRReport) Finished downloading report(s) with opendoar: " + opendoar); + } + + private String getJson(String url) throws Exception { + try { + System.out.println("===> Connecting to: " + url); + URL website = new URL(url); + System.out.println("Connection url -----> " + url); + URLConnection connection = website.openConnection(); + + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); +// response.append("\n"); + } + } + + System.out.println("response 
====> " + response.toString()); + + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } + + private String getJson(String url, String username, String password) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL", e); + return null; + } + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java new file mode 100644 index 000000000..904290af8 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaDownloadLogs.java @@ -0,0 +1,273 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import 
org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class LaReferenciaDownloadLogs { + + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; + + /* + * The Piwik's API method + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + private final String ApimethodGetAllSites = "?module=API&method=SitesManager.getSitesWithViewAccess"; + + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaDownloadLogs.class); + + public LaReferenciaDownloadLogs(String piwikUrl, String tokenAuth) throws Exception { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + this.createTables(); +// this.createTmpTables(); + } + + public void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); + + logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); + } + + private void createTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, 
id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); +// String sqlcreateRuleLaReferenciaLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialog " +// + " WHERE (EXISTS ( SELECT lareferencialog.matomoid, lareferencialog.source, lareferencialog.id_visit," +// + "lareferencialog.action, lareferencialog.\"timestamp\", lareferencialog.entity_id " +// + "FROM lareferencialog " +// + "WHERE lareferencialog.matomoid=new.matomoid AND lareferencialog.source = new.source AND lareferencialog.id_visit = new.id_visit AND lareferencialog.action = new.action AND lareferencialog.entity_id = new.entity_id AND lareferencialog.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// String sqlCreateRuleIndexLaReferenciaLog = "create index if not exists lareferencialog_rule on lareferencialog(matomoid, source, id_visit, action, entity_id, \"timestamp\");"; +// stmt.executeUpdate(sqlcreateRuleLaReferenciaLog); +// stmt.executeUpdate(sqlCreateRuleIndexLaReferenciaLog); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Lareferencia Tables Created"); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + // System.exit(0); + } + } + +// private void createTmpTables() throws Exception { +// +// try { +// Statement stmt = ConnectDB.getConnection().createStatement(); +// String sqlCreateTmpTableLaReferenciaLog = "CREATE TABLE IF NOT EXISTS lareferencialogtmp(matomoid INTEGER, source TEXT, id_visit TEXT, country TEXT, action TEXT, url TEXT, entity_id TEXT, source_item_type TEXT, timestamp TEXT, referrer_name TEXT, agent TEXT, PRIMARY KEY(source, id_visit, action, timestamp, entity_id));"; +// String sqlcreateTmpRuleLaReferenciaLog = "CREATE OR REPLACE RULE 
ignore_duplicate_inserts AS " +// + " ON INSERT TO lareferencialogtmp " +// + " WHERE (EXISTS ( SELECT lareferencialogtmp.matomoid, lareferencialogtmp.source, lareferencialogtmp.id_visit," +// + "lareferencialogtmp.action, lareferencialogtmp.\"timestamp\", lareferencialogtmp.entity_id " +// + "FROM lareferencialogtmp " +// + "WHERE lareferencialogtmp.matomoid=new.matomoid AND lareferencialogtmp.source = new.source AND lareferencialogtmp.id_visit = new.id_visit AND lareferencialogtmp.action = new.action AND lareferencialogtmp.entity_id = new.entity_id AND lareferencialogtmp.\"timestamp\" = new.\"timestamp\")) DO INSTEAD NOTHING;"; +// stmt.executeUpdate(sqlCreateTmpTableLaReferenciaLog); +// stmt.executeUpdate(sqlcreateTmpRuleLaReferenciaLog); +// +// stmt.close(); +// log.info("Lareferencia Tmp Tables Created"); +// +// } catch (Exception e) { +// log.error("Failed to create tmptables: " + e); +// throw new Exception("Failed to create tmp tables: " + e.toString(), e); +// // System.exit(0); +// } +// } + private String getPiwikLogUrl() { + return piwikUrl + "/"; + } + + private String getJson(String url) throws Exception { + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); + + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); +// response.append("\n"); + } + } + + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + e); + throw new Exception("Failed to get URL: " + e.toString(), e); + } + } + + public void GetLaReferenciaRepos(String repoLogsPath) throws Exception { + + String baseApiUrl = getPiwikLogUrl() + ApimethodGetAllSites + format + "&token_auth=" + this.tokenAuth; + String content = ""; + + List siteIdsToVisit = new ArrayList(); + + // Getting all the siteIds in a list for 
logging reasons & limiting the list + // to the max number of siteIds + content = getJson(baseApiUrl); + JSONParser parser = new JSONParser(); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRow = (JSONObject) aJsonArray; + siteIdsToVisit.add(Integer.parseInt(jsonObjectRow.get("idsite").toString())); + } + logger.info("Found the following siteIds for download: " + siteIdsToVisit); + + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= siteIdsToVisit.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + siteIdsToVisit = siteIdsToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } + + logger.info("Downloading from repos with the followins siteIds: " + siteIdsToVisit); + + for (int siteId : siteIdsToVisit) { + logger.info("Now working on LaReferencia MatomoId: " + siteId); + this.GetLaReFerenciaLogs(repoLogsPath, siteId); + } + } + + public void GetLaReFerenciaLogs(String repoLogsPath, + int laReferencialMatomoID) throws Exception { + + logger.info("Downloading logs for LaReferencia repoid " + laReferencialMatomoID); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + + logger.info("Ending period for log download: " + sdf.format(end.getTime())); + + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + 
".lareferencialog WHERE matomoid=?"); + st.setInt(1, laReferencialMatomoID); + Date dateMax = null; + + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + Date date = currDay.getTime(); + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger + .info( + "Date found in logs " + dateMax + " and not downloanding Matomo logs for " + + laReferencialMatomoID); + } else { + logger + .info( + "Downloading logs for LaReferencia repoid " + laReferencialMatomoID + " and for " + + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + outFolder = repoLogsPath; + + FileSystem fs = FileSystem.get(new Configuration()); + FSDataOutputStream fin = fs + .create( + new Path( + outFolder + "/" + laReferencialMatomoID + "_LaRefPiwiklog" + sdf.format((date)) + ".json"), + true); + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + laReferencialMatomoID + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + int i = 0; + + JSONParser parser = new JSONParser(); + do { + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } + + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + fin.write(jsonObjectRaw.toJSONString().getBytes()); + fin.writeChar('\n'); + } + + logger + .info( + "Downloaded part " + i + " of logs for LaReferencia repoid " + laReferencialMatomoID + + 
" and for " + + sdf.format(date)); + i++; + } while (true); + fin.close(); + } + } + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java new file mode 100644 index 000000000..bcf1711cb --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/LaReferenciaStats.java @@ -0,0 +1,291 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.URLDecoder; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.sql.Statement; +import java.sql.Timestamp; +import java.text.SimpleDateFormat; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. 
Zoupanos + */ +public class LaReferenciaStats { + + private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class); + + private String logRepoPath; + + private Statement stmt = null; + + private String CounterRobotsURL; + private ArrayList robotsList; + + public LaReferenciaStats(String logRepoPath) throws Exception { + this.logRepoPath = logRepoPath; + this.createTables(); +// this.createTmpTables(); + } + + /* + * private void connectDB() throws Exception { try { ConnectDB connectDB = new ConnectDB(); } catch (Exception e) { + * log.error("Connect to db failed: " + e); throw new Exception("Failed to connect to db: " + e.toString(), e); } } + */ + private void createTables() throws Exception { + try { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Lareferencia Tables Created"); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + // System.exit(0); + } + } + + public void processLogs() throws Exception { + try { + logger.info("Processing LaReferencia repository logs"); + processlaReferenciaLog(); + logger.info("LaReferencia repository logs process done"); + + logger.info("LaReferencia removing double clicks"); + removeDoubleClicks(); + 
logger.info("LaReferencia removed double clicks"); + + logger.info("LaReferencia updating Production Tables"); + updateProdTables(); + logger.info("LaReferencia updated Production Tables"); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + public void processlaReferenciaLog() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping lareferencialogtmp_json table"); + String drop_lareferencialogtmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogtmp_json"; + stmt.executeUpdate(drop_lareferencialogtmp_json); + logger.info("Dropped lareferencialogtmp_json table"); + + logger.info("Creating lareferencialogtmp_json"); + String create_lareferencialogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogtmp_json(\n" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `repItem` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " timestamp: STRING,\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " `customVariables`: struct<\n" + + " `1`: struct<\n" + + " `customVariablePageValue1`: STRING\n" + + " >,\n" + + " `2`: struct<\n" + + " `customVariablePageValue2`: STRING\n" + + " >\n" + + " >\n" + + " >\n" + + " >" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.lareferenciaLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_lareferencialogtmp_json); + logger.info("Created 
lareferencialogtmp_json"); + + logger.info("Dropping lareferencialogtmp table"); + String drop_lareferencialogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialogtmp"; + stmt.executeUpdate(drop_lareferencialogtmp); + logger.info("Dropped lareferencialogtmp table"); + + logger.info("Creating lareferencialogtmp"); + String create_lareferencialogtmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(create_lareferencialogtmp); + logger.info("Created lareferencialogtmp"); + + logger.info("Inserting into lareferencialogtmp"); + String insert_lareferencialogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp " + + "SELECT DISTINCT cast(idSite as INT) as matomoid, CONCAT('opendoar____::', " + + "actiondetail.customVariables.`2`.customVariablePageValue2) as source, idVisit as id_Visit, country, " + + "actiondetail.type as action, actiondetail.url as url, " + + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "referrerName as referrer_name, browser as agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json " + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_lareferencialogtmp); + logger.info("Inserted into lareferencialogtmp"); + + stmt.close(); + } + + public void removeDoubleClicks() throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + 
ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning download double clicks"); + // clean download double clicks + String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp WHERE EXISTS (" + + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p1, " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp p2 " + + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id " + + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp " + + "AND p1.timestamp listHdfsDir(String dir) throws Exception { + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + // log.info("Found hdfs file " + fileName); + fileNames.add(fileName); + } + // hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logRepoPath)); + throw new Exception("HDFS file path with exported data does not exist : " + logRepoPath, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + if (!line.equals("[]")) { + sb.append(line); + } + // sb.append(line); + line = br.readLine(); + } + result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + if (result.equals("")) { + result = "[]"; + } + + 
// fs.close(); + } catch (Exception e) { + logger.error(e.getMessage()); + throw new Exception(e); + } + + return result; + } + +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java new file mode 100644 index 000000000..a84d6743f --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikDownloadLogs.java @@ -0,0 +1,331 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.Authenticator; +import java.net.URL; +import java.net.URLConnection; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. 
Zoupanos + */ +public class PiwikDownloadLogs { + + private final String piwikUrl; + private Date startDate; + private final String tokenAuth; + + /* + * The Piwik's API method + */ + private final String APImethod = "?module=API&method=Live.getLastVisitsDetails"; + private final String format = "&format=json"; + + private static final Logger logger = LoggerFactory.getLogger(PiwikDownloadLogs.class); + + public PiwikDownloadLogs(String piwikUrl, String tokenAuth) { + this.piwikUrl = piwikUrl; + this.tokenAuth = tokenAuth; + + } + + private String getPiwikLogUrl() { + return "https://" + piwikUrl + "/"; + } + + private String getJson(String url) throws Exception { + try { + logger.debug("Connecting to download the JSON: " + url); + URL website = new URL(url); + URLConnection connection = website.openConnection(); + + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + } + } + return response.toString(); + } catch (Exception e) { + logger.error("Failed to get URL: " + url + " Exception: " + e); + throw new Exception("Failed to get URL: " + url + " Exception: " + e.toString(), e); + } + } + + class WorkerThread implements Runnable { + + private Calendar currDay; + private int siteId; + private String repoLogsPath; + private String portalLogPath; + private String portalMatomoID; + + public WorkerThread(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws IOException { + this.currDay = (Calendar) currDay.clone(); + this.siteId = new Integer(siteId); + this.repoLogsPath = new String(repoLogsPath); + this.portalLogPath = new String(portalLogPath); + this.portalMatomoID = new String(portalMatomoID); + } + + public void run() { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + System.out + .println( + 
Thread.currentThread().getName() + " (Start) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + try { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + System.out + .println( + Thread.currentThread().getName() + " (End) Thread for " + + "parameters: currDay=" + sdf.format(currDay.getTime()) + ", siteId=" + siteId + + ", repoLogsPath=" + repoLogsPath + ", portalLogPath=" + portalLogPath + + ", portalLogPath=" + portalLogPath + ", portalMatomoID=" + portalMatomoID); + } + + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + + int i = 0; + + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); + + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } + + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId 
+ "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + writtenBytes += jsonObjectRawBytes.length + 1; + } + + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); + + i++; + } while (true); + + fs.close(); + } + } + + public void GetOpenAIRELogs(String repoLogsPath, String portalLogPath, String portalMatomoID) throws Exception { + + Statement statement = ConnectDB.getHiveConnection().createStatement(); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + + ResultSet rs = statement + .executeQuery( + "SELECT distinct piwik_id from " + ConnectDB.getStatsDBSchema() + + ".datasource where piwik_id is not null and piwik_id <> 0 order by piwik_id"); + + // Getting all the piwikids in a list for logging reasons & limitting the list + // to the max number of piwikids + List piwikIdToVisit = new ArrayList(); + while (rs.next()) { + piwikIdToVisit.add(rs.getInt(1)); + } + logger.info("Found the following piwikIds for download: " + piwikIdToVisit); + + if (ExecuteWorkflow.numberOfPiwikIdsToDownload > 0 + && ExecuteWorkflow.numberOfPiwikIdsToDownload <= piwikIdToVisit.size()) { + logger.info("Trimming piwikIds list to the size of: " + ExecuteWorkflow.numberOfPiwikIdsToDownload); + piwikIdToVisit = piwikIdToVisit.subList(0, ExecuteWorkflow.numberOfPiwikIdsToDownload); + } + + logger.info("Downloading from repos with the followins piwikIds: " + piwikIdToVisit); + + // ExecutorService executor = Executors.newFixedThreadPool(ExecuteWorkflow.numberOfDownloadThreads); + for (int siteId : piwikIdToVisit) { + // 
Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("Starting period for log download: " + sdf.format(start.getTime())); + + // Setting the ending period (last day of the month) + // Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + // end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + logger.info("Ending period for log download: " + sdf.format(end.getTime())); + + logger.info("Now working on piwikId: " + siteId); + + PreparedStatement st = ConnectDB.DB_HIVE_CONNECTION + .prepareStatement( + "SELECT max(timestamp) FROM " + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog WHERE source=?"); + st.setInt(1, siteId); + Date dateMax = null; + ResultSet rs_date = st.executeQuery(); + while (rs_date.next()) { + logger.info("Found max date: " + rs_date.getString(1) + " for repository " + siteId); + + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + + for (Calendar currDay = (Calendar) start.clone(); currDay.before(end); currDay.add(Calendar.DATE, 1)) { + // logger.info("Date used " + currDay.toString()); + // Runnable worker = new WorkerThread(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + // executor.execute(worker);// calling execute method of ExecutorService + logger.info("Date used " + currDay.getTime().toString()); + + if (dateMax != null && currDay.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding Matomo logs for " + siteId); + } else { + GetOpenAIRELogsForDate(currDay, siteId, repoLogsPath, portalLogPath, portalMatomoID); + } + + } + } + // executor.shutdown(); + // while (!executor.isTerminated()) { + // } + // 
System.out.println("Finished all threads"); + } + + public void GetOpenAIRELogsForDate(Calendar currDay, int siteId, String repoLogsPath, String portalLogPath, + String portalMatomoID) throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + + Date date = currDay.getTime(); + logger.info("Downloading logs for repoid " + siteId + " and for " + sdf.format(date)); + + String period = "&period=day&date=" + sdf.format(date); + String outFolder = ""; + if (siteId == Integer.parseInt(portalMatomoID)) { + outFolder = portalLogPath; + } else { + outFolder = repoLogsPath; + } + + String baseApiUrl = getPiwikLogUrl() + APImethod + "&idSite=" + siteId + period + format + + "&expanded=5&filter_limit=1000&token_auth=" + tokenAuth; + String content = ""; + + int i = 0; + + JSONParser parser = new JSONParser(); + StringBuffer totalContent = new StringBuffer(); + FileSystem fs = FileSystem.get(new Configuration()); + + do { + int writtenBytes = 0; + String apiUrl = baseApiUrl; + + if (i > 0) { + apiUrl += "&filter_offset=" + (i * 1000); + } + + content = getJson(apiUrl); + if (content.length() == 0 || content.equals("[]")) { + break; + } + + FSDataOutputStream fin = fs + .create( + new Path(outFolder + "/" + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"), + true); + JSONArray jsonArray = (JSONArray) parser.parse(content); + for (Object aJsonArray : jsonArray) { + JSONObject jsonObjectRaw = (JSONObject) aJsonArray; + byte[] jsonObjectRawBytes = jsonObjectRaw.toJSONString().getBytes(); + fin.write(jsonObjectRawBytes); + fin.writeChar('\n'); + + writtenBytes += jsonObjectRawBytes.length + 1; + } + + fin.close(); + System.out + .println( + Thread.currentThread().getName() + " (Finished writing) Wrote " + writtenBytes + + " bytes. 
Filename: " + siteId + "_Piwiklog" + sdf.format((date)) + "_offset_" + i + + ".json"); + + i++; + } while (true); + + fs.close(); + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java new file mode 100644 index 000000000..9144620b7 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/PiwikStatsDB.java @@ -0,0 +1,835 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +import java.net.URLDecoder; +import java.sql.Connection; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. 
Zoupanos + */ +public class PiwikStatsDB { + + private String logPath; + private String logRepoPath; + private String logPortalPath; + + private Statement stmt = null; + + private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); + + private String CounterRobotsURL; + private ArrayList robotsList; + + public PiwikStatsDB(String logRepoPath, String logPortalPath) throws Exception { + this.logRepoPath = logRepoPath; + this.logPortalPath = logPortalPath; + + } + + public void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); + + logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); + + logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); + + logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); + } + + public void recreateDBAndTables() throws Exception { + this.createDatabase(); + this.createTables(); + // The piwiklog table is not needed since it is built + // on top of JSON files + //////////// this.createTmpTables(); + } + + public ArrayList getRobotsList() { + return robotsList; + } + + public void setRobotsList(ArrayList robotsList) { + this.robotsList = robotsList; + } + + public String getCounterRobotsURL() { + return CounterRobotsURL; + } + + public void setCounterRobotsURL(String CounterRobotsURL) { + this.CounterRobotsURL = CounterRobotsURL; + } + + private void createDatabase() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String dropDatabase = "DROP DATABASE IF EXISTS " + 
ConnectDB.getUsageStatsDBSchema() + " CASCADE"; + stmt.executeUpdate(dropDatabase); + + } catch (Exception e) { + logger.error("Failed to drop database: " + e); + throw new Exception("Failed to drop database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + private void createTables() throws Exception { + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + // Create Piwiklog table - This table should exist + String sqlCreateTablePiwikLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) " + + "into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePiwikLog); + +// String dropT = "TRUNCATE TABLE " +// + ConnectDB.getUsageStatsDBSchema() +// + ".piwiklog "; +// stmt.executeUpdate(dropT); +// logger.info("truncated piwiklog"); + + ///////////////////////////////////////// + // Rule for duplicate inserts @ piwiklog + ///////////////////////////////////////// + String sqlCreateTablePortalLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log(source INT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 
buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTablePortalLog); + + ////////////////////////////////////////////////// + // Rule for duplicate inserts @ process_portal_log + ////////////////////////////////////////////////// + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + public void processLogs() throws Exception { + try { + ReadCounterRobotsList counterRobots = new ReadCounterRobotsList(this.getCounterRobotsURL()); + this.robotsList = counterRobots.getRobotsPatterns(); + + logger.info("Processing repository logs"); + processRepositoryLog(); + logger.info("Repository logs process done"); + + logger.info("Removing double clicks"); + removeDoubleClicks(); + logger.info("Removing double clicks done"); + + logger.info("Cleaning oai"); + cleanOAI(); + logger.info("Cleaning oai done"); + + logger.info("Processing portal logs"); + processPortalLog(); + logger.info("Portal logs process done"); + + logger.info("Processing portal usagestats"); + portalLogs(); + logger.info("Portal usagestats process done"); + + logger.info("Updating Production Tables"); + updateProdTables(); + logger.info("Updated Production Tables"); + + logger.info("Create Pedocs Tables"); + createPedocsOldUsageData(); + logger.info("Pedocs Tables Created"); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + public void processRepositoryLog() throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping 
piwiklogtmp_json table"); + String drop_piwiklogtmp_json = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json"; + stmt.executeUpdate(drop_piwiklogtmp_json); + logger.info("Dropped piwiklogtmp_json table"); + + logger.info("Creating piwiklogtmp_json"); + String create_piwiklogtmp_json = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp_json(\n" + + " `idSite` STRING,\n" + + " `idVisit` STRING,\n" + + " `country` STRING,\n" + + " `referrerName` STRING,\n" + + " `browser` STRING,\n" + + " `actionDetails` ARRAY<\n" + + " struct<\n" + + " type: STRING,\n" + + " url: STRING,\n" + + " `customVariables`: struct<\n" + + " `1`: struct<\n" + + " `customVariablePageValue1`: STRING\n" + + " >\n" + + " >,\n" + + " timestamp: String\n" + + " >\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.repoLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_piwiklogtmp_json); + logger.info("Created piwiklogtmp_json"); + + logger.info("Dropping piwiklogtmp table"); + String drop_piwiklogtmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp"; + stmt.executeUpdate(drop_piwiklogtmp); + logger.info("Dropped piwiklogtmp"); + + logger.info("Creating piwiklogtmp"); + String create_piwiklogtmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".piwiklogtmp (source BIGINT, id_Visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(create_piwiklogtmp); + logger.info("Created piwiklogtmp"); + + logger.info("Inserting into piwiklogtmp"); + String insert_piwiklogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + 
"SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, " + + "actiondetail.type as action, actiondetail.url as url, " + + "actiondetail.customVariables.`1`.`customVariablePageValue1` as entity_id, " + + "'repItem' as source_item_type, from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, " + + "referrerName as referrer_name, browser as agent\n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json\n" + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_piwiklogtmp); + logger.info("Inserted into piwiklogtmp"); + + stmt.close(); + } + + public void removeDoubleClicks() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning download double clicks"); + // clean download double clicks + String sql = "DELETE from " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "WHERE EXISTS (\n" + + "SELECT DISTINCT p1.source, p1.id_visit, p1.action, p1.entity_id, p1.timestamp \n" + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p1, " + + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp p2\n" + + "WHERE p1.source=p2.source AND p1.id_visit=p2.id_visit AND p1.entity_id=p2.entity_id \n" + + "AND p1.action=p2.action AND p1.action='download' AND p1.timestamp!=p2.timestamp \n" + + "AND p1.timestamp\n" + + " >\n" + + ")\n" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + ExecuteWorkflow.portalLogPath + "'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_process_portal_log_tmp_json); + logger.info("Created process_portal_log_tmp_json"); + + logger.info("Droping process_portal_log_tmp table"); + String drop_process_portal_log_tmp = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp"; + stmt.executeUpdate(drop_process_portal_log_tmp); + 
logger.info("Dropped process_portal_log_tmp"); + + logger.info("Creating process_portal_log_tmp"); + String create_process_portal_log_tmp = "CREATE TABLE " + + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp (source BIGINT, id_visit STRING, country STRING, action STRING, url STRING, " + + "entity_id STRING, source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, timestamp) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(create_process_portal_log_tmp); + logger.info("Created process_portal_log_tmp"); + + logger.info("Inserting into process_portal_log_tmp"); + String insert_process_portal_log_tmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".process_portal_log_tmp " + + "SELECT DISTINCT cast(idSite as BIGINT) as source, idVisit as id_Visit, country, actiondetail.type as action, " + + "actiondetail.url as url, " + + "CASE\n" + + " WHEN (actiondetail.url like '%datasourceId=%') THEN split(actiondetail.url,'datasourceId=')[1] " + + " WHEN (actiondetail.url like '%datasource=%') THEN split(actiondetail.url,'datasource=')[1] " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN split(actiondetail.url,'datasourceFilter=')[1] " + + " WHEN (actiondetail.url like '%articleId=%') THEN split(actiondetail.url,'articleId=')[1] " + + " WHEN (actiondetail.url like '%datasetId=%') THEN split(actiondetail.url,'datasetId=')[1] " + + " WHEN (actiondetail.url like '%projectId=%') THEN split(actiondetail.url,'projectId=')[1] " + + " WHEN (actiondetail.url like '%organizationId=%') THEN split(actiondetail.url,'organizationId=')[1] " + + " ELSE '' " + + "END AS entity_id, " + + "CASE " + + " WHEN (actiondetail.url like '%datasourceId=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasource=%') THEN 'datasource' " + + " WHEN (actiondetail.url like '%datasourceFilter=%') THEN 'datasource' " + + " WHEN (actiondetail.url like 
'%articleId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%datasetId=%') THEN 'result' " + + " WHEN (actiondetail.url like '%projectId=%') THEN 'project' " + + " WHEN (actiondetail.url like '%organizationId=%') THEN 'organization' " + + " ELSE '' " + + "END AS source_item_type, " + + "from_unixtime(cast(actiondetail.timestamp as BIGINT)) as timestamp, referrerName as referrer_name, " + + "browser as agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json " + + "LATERAL VIEW explode(actiondetails) actiondetailsTable AS actiondetail"; + stmt.executeUpdate(insert_process_portal_log_tmp); + logger.info("Inserted into process_portal_log_tmp"); + + stmt.close(); + } + + public void portalLogs() throws SQLException { + Connection con = ConnectDB.getHiveConnection(); + Statement stmt = con.createStatement(); + con.setAutoCommit(false); + + logger.info("PortalStats - Step 1"); + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'oaItem', `timestamp`, referrer_name, agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".result_oids roid WHERE roid.id IS NOT NULL)"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("PortalStats - Step 2"); + stmt = con.createStatement(); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'datasource', `timestamp`, referrer_name, agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".datasource_oids roid WHERE roid.id 
IS NOT NULL)"; + stmt.executeUpdate(sql); + stmt.close(); + + /* + * logger.info("PortalStats - Step 3"); stmt = con.createStatement(); sql = "INSERT INTO " + + * ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + * "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'organization', `timestamp`, referrer_name, agent " + * + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + * "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + * "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + * ".organization_oids roid WHERE roid.id IS NOT NULL)"; // stmt.executeUpdate(sql); stmt.close(); + */ + logger.info("PortalStats - Step 3"); + stmt = con.createStatement(); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SELECT DISTINCT source, id_visit, country, action, url, entity_id, 'project', `timestamp`, referrer_name, agent " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp " + + "WHERE process_portal_log_tmp.entity_id IS NOT NULL AND process_portal_log_tmp.entity_id " + + "IN (SELECT roid.id FROM " + ConnectDB.getStatsDBSchema() + + ".project_oids roid WHERE roid.id IS NOT NULL)"; + stmt.executeUpdate(sql); + stmt.close(); + + con.close(); + } + + private void cleanOAI() throws Exception { + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Cleaning oai - Step 1"); + stmt = ConnectDB.getHiveConnection().createStatement(); + String sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chlc.min-saude.pt/'," + + "'oai:repositorio.chlc.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.chlc.min-saude.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 2"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + 
"SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hospitaldebraga.pt/'," + + "'oai:repositorio.hospitaldebraga.pt:') WHERE entity_id LIKE 'oai:repositorio.hospitaldebraga.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 3"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipl.pt/'," + + "'oai:repositorio.ipl.pt:') WHERE entity_id LIKE 'oai:repositorio.ipl.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 4"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bibliotecadigital.ipb.pt/'," + + "'oai:bibliotecadigital.ipb.pt:') WHERE entity_id LIKE 'oai:bibliotecadigital.ipb.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 5"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ismai.pt/'," + + "'oai:repositorio.ismai.pt:') WHERE entity_id LIKE 'oai:repositorio.ismai.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 6"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorioaberto.uab.pt/'," + + "'oai:repositorioaberto.uab.pt:') WHERE entity_id LIKE 'oai:repositorioaberto.uab.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 7"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, 
'^oai:repositorio.uac.pt/'," + + "'oai:repositorio.uac.pt:') WHERE entity_id LIKE 'oai:repositorio.uac.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 8"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.insa.pt/'," + + "'oai:repositorio.insa.pt:') WHERE entity_id LIKE 'oai:repositorio.insa.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 9"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipcb.pt/'," + + "'oai:repositorio.ipcb.pt:') WHERE entity_id LIKE 'oai:repositorio.ipcb.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 10"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ispa.pt/'," + + "'oai:repositorio.ispa.pt:') WHERE entity_id LIKE 'oai:repositorio.ispa.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 11"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.chporto.pt/'," + + "'oai:repositorio.chporto.pt:') WHERE entity_id LIKE 'oai:repositorio.chporto.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 12"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ucp.pt/'," + + "'oai:repositorio.ucp.pt:') WHERE entity_id LIKE 'oai:repositorio.ucp.pt/%'"; 
+ stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 13"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:rihuc.huc.min-saude.pt/'," + + "'oai:rihuc.huc.min-saude.pt:') WHERE entity_id LIKE 'oai:rihuc.huc.min-saude.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 14"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipv.pt/'," + + "'oai:repositorio.ipv.pt:') WHERE entity_id LIKE 'oai:repositorio.ipv.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 15"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:www.repository.utl.pt/'," + + "'oai:www.repository.utl.pt:') WHERE entity_id LIKE 'oai:www.repository.utl.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 16"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:run.unl.pt/'," + + "'oai:run.unl.pt:') WHERE entity_id LIKE 'oai:run.unl.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 17"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:sapientia.ualg.pt/'," + + "'oai:sapientia.ualg.pt:') WHERE entity_id LIKE 'oai:sapientia.ualg.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 18"); + stmt = 
ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ipsantarem.pt/'," + + "'oai:repositorio.ipsantarem.pt:') WHERE entity_id LIKE 'oai:repositorio.ipsantarem.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 19"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:arca.igc.gulbenkian.pt/'," + + "'oai:arca.igc.gulbenkian.pt:') WHERE entity_id LIKE 'oai:arca.igc.gulbenkian.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 20"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:ubibliorum.ubi.pt/'," + + "'oai:ubibliorum.ubi.pt:') WHERE entity_id LIKE 'oai:ubibliorum.ubi.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 21"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:digituma.uma.pt/'," + + "'oai:digituma.uma.pt:') WHERE entity_id LIKE 'oai:digituma.uma.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 22"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.ul.pt/'," + + "'oai:repositorio.ul.pt:') WHERE entity_id LIKE 'oai:repositorio.ul.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 23"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + 
".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.hff.min-saude.pt/'," + + "'oai:repositorio.hff.min-saude.pt:') WHERE entity_id LIKE 'oai:repositorio.hff.min-saude.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 24"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorium.sdum.uminho.pt/'," + + "'oai:repositorium.sdum.uminho.pt:') WHERE entity_id LIKE 'oai:repositorium.sdum.uminho.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 25"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:recipp.ipp.pt/'," + + "'oai:recipp.ipp.pt:') WHERE entity_id LIKE 'oai:recipp.ipp.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 26"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:bdigital.ufp.pt/'," + + "'oai:bdigital.ufp.pt:') WHERE entity_id LIKE 'oai:bdigital.ufp.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 27"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:repositorio.lneg.pt/'," + + "'oai:repositorio.lneg.pt:') WHERE entity_id LIKE 'oai:repositorio.lneg.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 28"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:iconline.ipleiria.pt/'," 
+ + "'oai:iconline.ipleiria.pt:') WHERE entity_id LIKE 'oai:iconline.ipleiria.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Step 29"); + stmt = ConnectDB.getHiveConnection().createStatement(); + sql = "UPDATE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp " + + "SET entity_id = regexp_replace(entity_id, '^oai:comum.rcaap.pt/'," + + "'oai:comum.rcaap.pt:') WHERE entity_id LIKE 'oai:comum.rcaap.pt/%'"; + stmt.executeUpdate(sql); + stmt.close(); + + logger.info("Cleaning oai - Done, closing connection"); + ConnectDB.getHiveConnection().close(); + } + + private void updateProdTables() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Inserting data to piwiklog"); + String sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".piwiklog " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + stmt.executeUpdate(sql); + + logger.info("Dropping piwiklogtmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogtmp"); + + logger.info("Dropping process_portal_log_tmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; + stmt.executeUpdate(sql); + logger.info("Dropped process_portal_log_tmp"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } + + public void finalizeStats() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping piwiklogtmp"); + String sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogtmp"); + + logger.info("Dropping process_portal_log_tmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp"; + stmt.executeUpdate(sql); 
+ logger.info("Dropped process_portal_log_tmp"); + + logger.info("Dropping irus_sushilogtmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped irus_sushilogtmp"); + + logger.info("Dropping irus_sushilogtmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".irus_sushilogtmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped irus_sushilogtmp_json"); + + logger.info("Dropping lareferencialogtmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".lareferencialogtmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped lareferencialogtmp_json"); + + logger.info("Dropping piwiklogtmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".piwiklogtmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped piwiklogtmp_json"); + + logger.info("Dropping process_portal_log_tmp_json"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".process_portal_log_tmp_json"; + stmt.executeUpdate(sql); + logger.info("Dropped process_portal_log_tmp_json"); + + logger.info("Dropping sarc_sushilogtmp"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmt.executeUpdate(sql); + logger.info("Dropped sarc_sushilogtmp"); + + logger.info("Dropping sarc_sushilogtmp_json_array"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; + stmt.executeUpdate(sql); + logger.info("Dropped sarc_sushilogtmp_json_array"); + + logger.info("Dropping sarc_sushilogtmp_json_non_array"); + sql = "DROP TABLE " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(sql); + logger.info("Dropped sarc_sushilogtmp_json_non_array"); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + + } + + private ArrayList listHdfsDir(String dir) throws Exception { + + FileSystem hdfs = FileSystem.get(new Configuration()); + RemoteIterator Files; + ArrayList 
fileNames = new ArrayList<>(); + + try { + Path exportPath = new Path(hdfs.getUri() + dir); + Files = hdfs.listFiles(exportPath, false); + while (Files.hasNext()) { + String fileName = Files.next().getPath().toString(); + fileNames.add(fileName); + } + + hdfs.close(); + } catch (Exception e) { + logger.error("HDFS file path with exported data does not exist : " + new Path(hdfs.getUri() + logPath)); + throw new Exception("HDFS file path with exported data does not exist : " + logPath, e); + } + + return fileNames; + } + + private String readHDFSFile(String filename) throws Exception { + String result; + try { + + FileSystem fs = FileSystem.get(new Configuration()); + // log.info("reading file : " + filename); + + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(filename)))); + + StringBuilder sb = new StringBuilder(); + String line = br.readLine(); + + while (line != null) { + if (!line.equals("[]")) { + sb.append(line); + } + // sb.append(line); + line = br.readLine(); + } + result = sb.toString().replace("][{\"idSite\"", ",{\"idSite\""); + if (result.equals("")) { + result = "[]"; + } + + // fs.close(); + } catch (Exception e) { + logger.error(e.getMessage()); + throw new Exception(e); + } + + return result; + } + + private Connection getConnection() throws SQLException { + return ConnectDB.getHiveConnection(); + } + + public void createPedocsOldUsageData() throws SQLException { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating PeDocs Old Views Table"); + String sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pedocsoldviews as select * from default.pedocsviews"; + stmt.executeUpdate(sql); + logger.info("PeDocs Old Views Table created"); + + logger.info("Creating PeDocs Old Downloads Table"); + sql = "Create TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pedocsolddownloads as select * from 
default.pedocsdownloads"; + stmt.executeUpdate(sql); + logger.info("PeDocs Old Downloads Table created"); + + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java new file mode 100644 index 000000000..6f020daa0 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/ReadCounterRobotsList.java @@ -0,0 +1,54 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +/** + * @author D. Pierrakos, S. Zoupanos + */ +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.ArrayList; + +import org.json.JSONException; +import org.json.simple.JSONArray; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + +public class ReadCounterRobotsList { + + private ArrayList robotsPatterns = new ArrayList(); + private String COUNTER_ROBOTS_URL; + + public ReadCounterRobotsList(String url) throws IOException, JSONException, ParseException { + COUNTER_ROBOTS_URL = url; + robotsPatterns = readRobotsPartners(COUNTER_ROBOTS_URL); + } + + private ArrayList readRobotsPartners(String url) throws MalformedURLException, IOException, ParseException { + InputStream is = new URL(url).openStream(); + JSONParser parser = new JSONParser(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("ISO-8859-1"))); + JSONArray jsonArray = (JSONArray) 
parser.parse(reader); + for (Object aJsonArray : jsonArray) { + org.json.simple.JSONObject jsonObjectRow = (org.json.simple.JSONObject) aJsonArray; + robotsPatterns.add(jsonObjectRow.get("pattern").toString().replace("\\", "\\\\")); + } + return robotsPatterns; + } + + public ArrayList getRobotsPatterns() { + return robotsPatterns; + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java new file mode 100644 index 000000000..e85c972f5 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/SarcStats.java @@ -0,0 +1,500 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.*; +// import java.io.BufferedReader; +// import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. 
Zoupanos + */ +public class SarcStats { + + private Statement stmtHive = null; + private Statement stmtImpala = null; + + private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); + + public SarcStats() throws Exception { +// createTables(); + } + + private void createTables() throws Exception { + try { + + stmtHive = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmtHive.executeUpdate(sqlCreateTableSushiLog); + + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmtHive.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmtHive.executeUpdate(createSushiIndex); + + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + public void reCreateLogDirs() throws IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathArray), true); + 
+ logger.info("Deleting sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.delete(new Path(ExecuteWorkflow.sarcsReportPathNonArray), true); + + logger.info("Creating sarcsReport (Array) directory: " + ExecuteWorkflow.sarcsReportPathArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathArray)); + + logger.info("Creating sarcsReport (NonArray) directory: " + ExecuteWorkflow.sarcsReportPathNonArray); + dfs.mkdirs(new Path(ExecuteWorkflow.sarcsReportPathNonArray)); + } + + public void processSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Adding JSON Serde jar"); + stmt.executeUpdate("add jar /usr/share/cmf/common_jars/hive-hcatalog-core-1.1.0-cdh5.14.0.jar"); + logger.info("Added JSON Serde jar"); + + logger.info("Dropping sarc_sushilogtmp_json_array table"); + String drop_sarc_sushilogtmp_json_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_array); + logger.info("Dropped sarc_sushilogtmp_json_array table"); + + logger.info("Creating sarc_sushilogtmp_json_array table"); + String create_sarc_sushilogtmp_json_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array(\n" + + " `ItemIdentifier` ARRAY<\n" + + " struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >\n" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + 
stmt.executeUpdate(create_sarc_sushilogtmp_json_array); + logger.info("Created sarc_sushilogtmp_json_array table"); + + logger.info("Dropping sarc_sushilogtmp_json_non_array table"); + String drop_sarc_sushilogtmp_json_non_array = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(drop_sarc_sushilogtmp_json_non_array); + logger.info("Dropped sarc_sushilogtmp_json_non_array table"); + + logger.info("Creating sarc_sushilogtmp_json_non_array table"); + String create_sarc_sushilogtmp_json_non_array = "CREATE EXTERNAL TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array (\n" + + " `ItemIdentifier` struct<\n" + + " `Type`: STRING,\n" + + " `Value`: STRING\n" + + " >,\n" + + " `ItemPerformance` struct<\n" + + " `Period`: struct<\n" + + " `Begin`: STRING,\n" + + " `End`: STRING\n" + + " >,\n" + + " `Instance`: struct<\n" + + " `Count`: STRING,\n" + + " `MetricType`: STRING\n" + + " >\n" + + " >" + + ")" + + "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'\n" + + "LOCATION '" + sarcsReportPathNonArray + "/'\n" + + "TBLPROPERTIES (\"transactional\"=\"false\")"; + stmt.executeUpdate(create_sarc_sushilogtmp_json_non_array); + logger.info("Created sarc_sushilogtmp_json_non_array table"); + + logger.info("Creating sarc_sushilogtmp table"); + String create_sarc_sushilogtmp = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp(source STRING, repository STRING, " + + "rid STRING, date STRING, metric_type STRING, count INT) clustered by (source) into 100 buckets stored as orc " + + "tblproperties('transactional'='true')"; + stmt.executeUpdate(create_sarc_sushilogtmp); + logger.info("Created sarc_sushilogtmp table"); + + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + String insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 
'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + " `ItemIdent`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_array " + + "LATERAL VIEW posexplode(ItemIdentifier) ItemIdentifierTable AS seqi, ItemIdent " + + "WHERE `ItemIdent`.`Type`='DOI'"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_array)"); + + logger.info("Inserting to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + insert_sarc_sushilogtmp = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp " + + "SELECT 'SARC-OJS', split(split(INPUT__FILE__NAME,'SarcsARReport_')[1],'_')[0], " + + "`ItemIdentifier`.`Value`, `ItemPerformance`.`Period`.`Begin`, " + + "`ItemPerformance`.`Instance`.`MetricType`, `ItemPerformance`.`Instance`.`Count` " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp_json_non_array"; + stmt.executeUpdate(insert_sarc_sushilogtmp); + logger.info("Inserted to sarc_sushilogtmp table (sarc_sushilogtmp_json_non_array)"); + + ConnectDB.getHiveConnection().close(); + } + + public void getAndProcessSarc(String sarcsReportPathArray, String sarcsReportPathNonArray) throws Exception { + + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating sushilog table"); + String createSushilog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog " + + "(`source` string, " + + "`repository` string, " + + "`rid` string, " + + "`date` string, " + + "`metric_type` string, " + + "`count` int)"; + stmt.executeUpdate(createSushilog); + logger.info("Created sushilog table"); + + logger.info("Dropping sarc_sushilogtmp table"); + String drop_sarc_sushilogtmp = "DROP TABLE IF EXISTS " + + 
ConnectDB.getUsageStatsDBSchema() + + ".sarc_sushilogtmp"; + stmt.executeUpdate(drop_sarc_sushilogtmp); + logger.info("Dropped sarc_sushilogtmp table"); + ConnectDB.getHiveConnection().close(); + + List issnAndUrls = new ArrayList(); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/motricidade/sushiLite/v1_7/", "1646-107X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/antropologicas/sushiLite/v1_7/", "0873-819X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/interaccoes/sushiLite/v1_7/", "1646-2335" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cct/sushiLite/v1_7/", "2182-3030" + }); + issnAndUrls.add(new String[] { + "https://actapediatrica.spp.pt/sushiLite/v1_7/", "0873-9781" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sociologiapp/sushiLite/v1_7/", "0873-6529" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/finisterra/sushiLite/v1_7/", "0430-5027" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/sisyphus/sushiLite/v1_7/", "2182-8474" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/anestesiologia/sushiLite/v1_7/", "0871-6099" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/rpe/sushiLite/v1_7/", "0871-9187" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/psilogos/sushiLite/v1_7/", "1646-091X" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/juridica/sushiLite/v1_7/", "2183-5799" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/ecr/sushiLite/v1_7/", "1647-2098" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/nascercrescer/sushiLite/v1_7/", "0872-0754" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/cea/sushiLite/v1_7/", "1645-3794" + }); + issnAndUrls.add(new String[] { + "https://revistas.rcaap.pt/proelium/sushiLite/v1_7/", "1645-8826" + }); + issnAndUrls.add(new String[] { + 
"https://revistas.rcaap.pt/millenium/sushiLite/v1_7/", "0873-3015" + }); + + if (ExecuteWorkflow.sarcNumberOfIssnToDownload > 0 + && ExecuteWorkflow.sarcNumberOfIssnToDownload <= issnAndUrls.size()) { + logger.info("Trimming siteIds list to the size of: " + ExecuteWorkflow.sarcNumberOfIssnToDownload); + issnAndUrls = issnAndUrls.subList(0, ExecuteWorkflow.sarcNumberOfIssnToDownload); + } + + logger.info("(getAndProcessSarc) Downloading the followins opendoars: " + issnAndUrls); + + for (String[] issnAndUrl : issnAndUrls) { + logger.info("Now working on ISSN: " + issnAndUrl[1]); + getARReport(sarcsReportPathArray, sarcsReportPathNonArray, issnAndUrl[0], issnAndUrl[1]); + } + + } + + public void updateSarcLogs() throws Exception { + stmtHive = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + stmtImpala = ConnectDB.getImpalaConnection().createStatement(); + + // Insert into sushilog + logger.info("Inserting into sushilog"); + String insertSushiLog = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog SELECT * " + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_sushilogtmp"; + stmtHive.executeUpdate(insertSushiLog); + logger.info("Inserted into sushilog"); + + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + } + + public void getARReport(String sarcsReportPathArray, String sarcsReportPathNonArray, + String url, String issn) throws Exception { + logger.info("Processing SARC! 
issn: " + issn + " with url: " + url); + ConnectDB.getHiveConnection().setAutoCommit(false); + + SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM"); + // Setting the starting period + Calendar start = (Calendar) ExecuteWorkflow.startingLogPeriod.clone(); + logger.info("(getARReport) Starting period for log download: " + simpleDateFormat.format(start.getTime())); + + // Setting the ending period (last day of the month) +// Calendar end = (Calendar) ExecuteWorkflow.endingLogPeriod.clone(); +// end.add(Calendar.MONTH, +1); +// end.add(Calendar.DAY_OF_MONTH, -1); + Calendar end = Calendar.getInstance(); + end.add(Calendar.DAY_OF_MONTH, -1); + + logger.info("(getARReport) Ending period for log download: " + simpleDateFormat.format(end.getTime())); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); + PreparedStatement st = ConnectDB + .getHiveConnection() + .prepareStatement( + "SELECT max(date) FROM " + ConnectDB.getUsageStatsDBSchema() + ".sushilog WHERE repository=?"); + st.setString(1, issn); + ResultSet rs_date = st.executeQuery(); + Date dateMax = null; + while (rs_date.next()) { + if (rs_date.getString(1) != null && !rs_date.getString(1).equals("null") + && !rs_date.getString(1).equals("")) { + start.setTime(sdf.parse(rs_date.getString(1))); + dateMax = sdf.parse(rs_date.getString(1)); + } + } + rs_date.close(); + + // Creating the needed configuration for the correct storing of data + Configuration config = new Configuration(); + config.addResource(new Path("/etc/hadoop/conf/core-site.xml")); + config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")); + config + .set( + "fs.hdfs.impl", + org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + config + .set( + "fs.file.impl", + org.apache.hadoop.fs.LocalFileSystem.class.getName()); + FileSystem dfs = FileSystem.get(config); + + if (dateMax != null && end.getTime().compareTo(dateMax) <= 0) { + logger.info("Date found in logs " + dateMax + " and not downloanding logs for " 
+ issn); + } else { + start.add(Calendar.MONTH, 1); + while (start.before(end)) { + String reportUrl = url + "GetReport/?Report=AR1&Format=json&BeginDate=" + + simpleDateFormat.format(start.getTime()) + "&EndDate=" + simpleDateFormat.format(start.getTime()); + start.add(Calendar.MONTH, 1); + + logger.info("(getARReport) Getting report: " + reportUrl); + String text = getJson(reportUrl); + if (text == null) { + continue; + } + + JSONParser parser = new JSONParser(); + JSONObject jsonObject = null; + try { + jsonObject = (JSONObject) parser.parse(text); + } // if there is a parsing error continue with the next url + catch (ParseException pe) { + continue; + } + + jsonObject = (JSONObject) jsonObject.get("sc:ReportResponse"); + jsonObject = (JSONObject) jsonObject.get("sc:Report"); + if (jsonObject == null) { + continue; + } + jsonObject = (JSONObject) jsonObject.get("c:Report"); + jsonObject = (JSONObject) jsonObject.get("c:Customer"); + Object obj = jsonObject.get("c:ReportItems"); + JSONArray jsonArray = new JSONArray(); + if (obj instanceof JSONObject) { + jsonArray.add(obj); + } else { + jsonArray = (JSONArray) obj; + // jsonArray = (JSONArray) jsonObject.get("c:ReportItems"); + } + if (jsonArray == null) { + continue; + } + + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathArray = sarcsReportPathArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathArray); + FSDataOutputStream finArray = dfs.create(new Path(filePathArray), true); + + // Creating the file in the filesystem for the ItemIdentifier as array object + String filePathNonArray = sarcsReportPathNonArray + "/SarcsARReport_" + issn + "_" + + simpleDateFormat.format(start.getTime()) + ".json"; + logger.info("Storing to file: " + filePathNonArray); + FSDataOutputStream finNonArray = dfs.create(new Path(filePathNonArray), true); + + for (Object aJsonArray : jsonArray) { + + 
JSONObject jsonObjectRow = (JSONObject) aJsonArray; + renameKeysRecursively(":", jsonObjectRow); + + if (jsonObjectRow.get("ItemIdentifier") instanceof JSONObject) { + finNonArray.write(jsonObjectRow.toJSONString().getBytes()); + finNonArray.writeChar('\n'); + } else { + finArray.write(jsonObjectRow.toJSONString().getBytes()); + finArray.writeChar('\n'); + } + } + + finArray.close(); + finNonArray.close(); + + // Check the file size and if it is too big, delete it + File fileArray = new File(filePathArray); + if (fileArray.length() == 0) { + fileArray.delete(); + } + File fileNonArray = new File(filePathNonArray); + if (fileNonArray.length() == 0) { + fileNonArray.delete(); + } + + } + + dfs.close(); + } + // ConnectDB.getHiveConnection().close(); + } + + private void renameKeysRecursively(String delimiter, JSONArray givenJsonObj) throws Exception { + for (Object jjval : givenJsonObj) { + if (jjval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jjval); + } else if (jjval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jjval); + } // All other types of vals + else + ; + } + } + + private void renameKeysRecursively(String delimiter, JSONObject givenJsonObj) throws Exception { + Set jkeys = new HashSet(givenJsonObj.keySet()); + for (String jkey : jkeys) { + + String[] splitArray = jkey.split(delimiter); + String newJkey = splitArray[splitArray.length - 1]; + + Object jval = givenJsonObj.get(jkey); + givenJsonObj.remove(jkey); + givenJsonObj.put(newJkey, jval); + + if (jval instanceof JSONObject) { + renameKeysRecursively(delimiter, (JSONObject) jval); + } + + if (jval instanceof JSONArray) { + renameKeysRecursively(delimiter, (JSONArray) jval); + } + } + } + + private String getJson(String url) throws Exception { + // String cred=username+":"+password; + // String encoded = new sun.misc.BASE64Encoder().encode (cred.getBytes()); + try { + URL website = new URL(url); + URLConnection connection = website.openConnection(); 
+ // connection.setRequestProperty ("Authorization", "Basic "+encoded); + StringBuilder response; + try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + response = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + return response.toString(); + } catch (Exception e) { + + // Logging error and silently continuing + logger.error("Failed to get URL: " + e); + System.out.println("Failed to get URL: " + e); +// return null; +// throw new Exception("Failed to get URL: " + e.toString(), e); + } + return ""; + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java new file mode 100644 index 000000000..07e15605f --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/java/eu/dnetlib/oa/graph/usagerawdata/export/UsageStatsExporter.java @@ -0,0 +1,206 @@ + +package eu.dnetlib.oa.graph.usagerawdata.export; + +import java.io.IOException; +import java.sql.SQLException; +import java.sql.Statement; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Main class for downloading and processing Usage statistics + * + * @author D. Pierrakos, S. 
Zoupanos + */ +public class UsageStatsExporter { + + public UsageStatsExporter() { + + } + + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + private void reCreateLogDirs() throws IllegalArgumentException, IOException { + FileSystem dfs = FileSystem.get(new Configuration()); + + logger.info("Deleting repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.delete(new Path(ExecuteWorkflow.repoLogPath), true); + + logger.info("Deleting portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.delete(new Path(ExecuteWorkflow.portalLogPath), true); + + logger.info("Deleting lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.delete(new Path(ExecuteWorkflow.lareferenciaLogPath), true); + + logger.info("Creating repoLog directory: " + ExecuteWorkflow.repoLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.repoLogPath)); + + logger.info("Creating portalLog directory: " + ExecuteWorkflow.portalLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.portalLogPath)); + + logger.info("Creating lareferenciaLog directory: " + ExecuteWorkflow.lareferenciaLogPath); + dfs.mkdirs(new Path(ExecuteWorkflow.lareferenciaLogPath)); + } + + public void export() throws Exception { + + logger.info("Initialising DB properties"); + ConnectDB.init(); + + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); + + logger.info("Re-creating database and tables"); + if (ExecuteWorkflow.recreateDbAndTables) { + piwikstatsdb.recreateDBAndTables(); + logger.info("DB-Tables-TmpTables are created "); + } + + logger.info("Initializing the download logs module"); + PiwikDownloadLogs piwd = new PiwikDownloadLogs(ExecuteWorkflow.matomoBaseURL, ExecuteWorkflow.matomoAuthToken); + + if (ExecuteWorkflow.piwikEmptyDirs) { + logger.info("Recreating Piwik log directories"); + piwikstatsdb.reCreateLogDirs(); + } + + // Downloading piwik logs (also managing directory creation) + if 
(ExecuteWorkflow.downloadPiwikLogs) { + logger.info("Downloading piwik logs"); + piwd + .GetOpenAIRELogs( + ExecuteWorkflow.repoLogPath, + ExecuteWorkflow.portalLogPath, ExecuteWorkflow.portalMatomoID); + } + logger.info("Downloaded piwik logs"); + + // Create DB tables, insert/update statistics + String cRobotsUrl = "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json"; + piwikstatsdb.setCounterRobotsURL(cRobotsUrl); + + if (ExecuteWorkflow.processPiwikLogs) { + logger.info("Processing logs"); + piwikstatsdb.processLogs(); + } + + logger.info("Creating LaReferencia tables"); + LaReferenciaDownloadLogs lrf = new LaReferenciaDownloadLogs(ExecuteWorkflow.lareferenciaBaseURL, + ExecuteWorkflow.lareferenciaAuthToken); + + if (ExecuteWorkflow.laReferenciaEmptyDirs) { + logger.info("Recreating LaReferencia log directories"); + lrf.reCreateLogDirs(); + } + + if (ExecuteWorkflow.downloadLaReferenciaLogs) { + logger.info("Downloading LaReferencia logs"); + lrf.GetLaReferenciaRepos(ExecuteWorkflow.lareferenciaLogPath); + logger.info("Downloaded LaReferencia logs"); + } + + LaReferenciaStats lastats = new LaReferenciaStats(ExecuteWorkflow.lareferenciaLogPath); + + if (ExecuteWorkflow.processLaReferenciaLogs) { + logger.info("Processing LaReferencia logs"); + lastats.processLogs(); + logger.info("LaReferencia logs done"); + } + + IrusStats irusstats = new IrusStats(ExecuteWorkflow.irusUKBaseURL); + if (ExecuteWorkflow.irusCreateTablesEmptyDirs) { + logger.info("Creating Irus Stats tables"); + irusstats.createTables(); + logger.info("Created Irus Stats tables"); + + logger.info("Re-create log dirs"); + irusstats.reCreateLogDirs(); + logger.info("Re-created log dirs"); + } + + if (ExecuteWorkflow.irusDownloadReports) { + irusstats.getIrusRRReport(ExecuteWorkflow.irusUKReportPath); + } + + if (ExecuteWorkflow.irusProcessStats) { + irusstats.processIrusStats(); + logger.info("Irus done"); + } + + SarcStats sarcStats = new SarcStats(); + if 
(ExecuteWorkflow.sarcCreateTablesEmptyDirs) { + sarcStats.reCreateLogDirs(); + } + if (ExecuteWorkflow.sarcDownloadReports) { + sarcStats.getAndProcessSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + } + + if (ExecuteWorkflow.sarcProcessStats) { + sarcStats.processSarc(ExecuteWorkflow.sarcsReportPathArray, ExecuteWorkflow.sarcsReportPathNonArray); + sarcStats.updateSarcLogs(); + } + logger.info("Sarc done"); + // finalize usagestats + + logger.info("Dropping tmp tables"); + if (ExecuteWorkflow.finalizeStats) { + piwikstatsdb.finalizeStats(); + logger.info("Dropped tmp tables"); + } + + logger.info("Raw Data Download End"); + } + + public void createdDBWithTablesOnly() throws Exception { + logger.info("Initialising DB properties"); + ConnectDB.init(); + + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(ExecuteWorkflow.repoLogPath, ExecuteWorkflow.portalLogPath); + piwikstatsdb.recreateDBAndTables(); + + piwikstatsdb.createPedocsOldUsageData(); + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating LaReferencia tables"); + String sqlCreateTableLareferenciaLog = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + ".lareferencialog(matomoid INT, " + + "source STRING, id_visit STRING, country STRING, action STRING, url STRING, entity_id STRING, " + + "source_item_type STRING, timestamp STRING, referrer_name STRING, agent STRING) " + + "clustered by (source, id_visit, action, timestamp, entity_id) into 100 buckets " + + "stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableLareferenciaLog); + logger.info("Created LaReferencia tables"); + + logger.info("Creating sushilog"); + + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog(source STRING, " + + "repository STRING, rid STRING, date STRING, metric_type STRING, count INT) clustered by (source, " + + "repository, rid, date, 
metric_type) into 100 buckets stored as orc tblproperties('transactional'='true')"; + stmt.executeUpdate(sqlCreateTableSushiLog); + logger.info("Created sushilog"); + + logger.info("Updating piwiklog"); + String sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + + ".piwiklog select * from openaire_prod_usage_raw.piwiklog"; + stmt.executeUpdate(sql); + + logger.info("Updating lareferencialog"); + sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + + ".lareferencialog select * from openaire_prod_usage_raw.lareferencialog"; + stmt.executeUpdate(sql); + + logger.info("Updating sushilog"); + sql = "insert into " + ConnectDB.getUsageStatsDBSchema() + + ".sushilog select * from openaire_prod_usage_raw.sushilog"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + + } + +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json new file mode 100644 index 000000000..1aa5ad6f8 --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/export/usagerawdata_parameters.json @@ -0,0 +1,219 @@ +[ + { + "paramName": "mat", + "paramLongName": "matomoAuthToken", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "mbu", + "paramLongName": "matomoBaseURL", + "paramDescription": "URL of the isLookUp Service", + "paramRequired": true + }, + { + "paramName": "rlp", + "paramLongName": "repoLogPath", + "paramDescription": "nameNode of the source cluster", + "paramRequired": true + }, + { + "paramName": "plp", + "paramLongName": "portalLogPath", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "pmi", 
+ "paramLongName": "portalMatomoID", + "paramDescription": "namoNode of the target cluster", + "paramRequired": true + }, + { + "paramName": "iukbuw", + "paramLongName": "irusUKBaseURL", + "paramDescription": "working directory", + "paramRequired": true + }, + { + "paramName": "iukrp", + "paramLongName": "irusUKReportPath", + "paramDescription": "maximum number of map tasks used in the distcp process", + "paramRequired": true + }, + { + "paramName": "srpa", + "paramLongName": "sarcsReportPathArray", + "paramDescription": "memory for distcp action copying actionsets from remote cluster", + "paramRequired": true + }, + { + "paramName": "srpna", + "paramLongName": "sarcsReportPathNonArray", + "paramDescription": "timeout for distcp copying actions from remote cluster", + "paramRequired": true + }, + { + "paramName": "llp", + "paramLongName": "lareferenciaLogPath", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "lbu", + "paramLongName": "lareferenciaBaseURL", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "lat", + "paramLongName": "lareferenciaAuthToken", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "usdbs", + "paramLongName": "usageStatsDBSchema", + "paramDescription": "activate tranform-only mode. Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "activate tranform-only mode. 
Only apply transformation step", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "pwed", + "paramLongName": "piwikEmptyDirs", + "paramDescription": "Empty piwik directories?", + "paramRequired": true + }, + { + "paramName": "ppwl", + "paramLongName": "processPiwikLogs", + "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "dpwl", + "paramLongName": "downloadPiwikLogs", + "paramDescription": "download piwik logs?", + "paramRequired": true + }, + { + "paramName": "slp", + "paramLongName": "startingLogPeriod", + "paramDescription": "Starting log period", + "paramRequired": true + }, + { + "paramName": "npidd", + "paramLongName": "numberOfPiwikIdsToDownload", + "paramDescription": "Limit the number of the downloaded piwikids to the first numberOfPiwikIdsToDownload", + "paramRequired": true + }, + { + "paramName": "nsidd", + "paramLongName": "numberOfSiteIdsToDownload", + "paramDescription": "Limit the number of the downloaded siteids (La Referencia logs) to the first numberOfSiteIdsToDownload", + "paramRequired": true + }, + { + "paramName": "lerd", + "paramLongName": "laReferenciaEmptyDirs", + "paramDescription": "Empty LaReferencia directories?", + "paramRequired": true + }, + { + "paramName": "plrl", + "paramLongName": "processLaReferenciaLogs", + "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "dlrl", + "paramLongName": "downloadLaReferenciaLogs", + "paramDescription": "download La Referencia logs?", + "paramRequired": true + }, + { + "paramName": "icted", + "paramLongName": "irusCreateTablesEmptyDirs", + "paramDescription": "Irus section: 
Create tables and empty JSON directories?", + "paramRequired": true + }, + { + "paramName": "idr", + "paramLongName": "irusDownloadReports", + "paramDescription": "Irus section: Download reports?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "irusProcessStats", + "paramDescription": "Irus section: Process stats?", + "paramRequired": true + }, + { + "paramName": "inod", + "paramLongName": "irusNumberOfOpendoarsToDownload", + "paramDescription": "Limit the number of the downloaded Opendoars (Irus) to the first irusNumberOfOpendoarsToDownload", + "paramRequired": true + }, + { + "paramName": "icted", + "paramLongName": "sarcCreateTablesEmptyDirs", + "paramDescription": "Sarc section: Create tables and empty JSON directories?", + "paramRequired": true + }, + { + "paramName": "idr", + "paramLongName": "sarcDownloadReports", + "paramDescription": "Sarc section: Download reports?", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "sarcProcessStats", + "paramDescription": "Sarc section: Process stats?", + "paramRequired": true + }, + { + "paramName": "inod", + "paramLongName": "sarcNumberOfIssnToDownload", + "paramDescription": "Limit the number of the downloaded ISSN (Sarc) to the first sarcNumberOfIssnToDownload", + "paramRequired": true + }, + + { + "paramName": "fs", + "paramLongName": "finalizeStats", + "paramDescription": "Create the usage_stats table?", + "paramRequired": true + }, + { + "paramName": "nodt", + "paramLongName": "numberOfDownloadThreads", + "paramDescription": "Number of download threads", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml new file mode 100644 index 000000000..b5c807378 --- /dev/null +++ 
b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 + + + impalaJdbcUrl + jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + oozie.use.system.libpath + true + + diff --git a/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml new file mode 100644 index 000000000..022a107ab --- /dev/null +++ b/dhp-workflows/dhp-usage-raw-data-update/src/main/resources/eu/dnetlib/dhp/oa/graph/usagerawdata/oozie_app/workflow.xml @@ -0,0 +1,88 @@ + + + + hiveMetastoreUris + Hive server metastore URIs + + + hiveJdbcUrl + Hive server jdbc url + + + impalaJdbcUrl + Impala server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + eu.dnetlib.oa.graph.usagerawdata.export.ExecuteWorkflow + --matomoAuthToken${matomoAuthToken} + --matomoBaseURL${matomoBaseURL} + --repoLogPath${repoLogPath} + --portalLogPath${portalLogPath} + --portalMatomoID${portalMatomoID} + --irusUKBaseURL${irusUKBaseURL} + --irusUKReportPath${irusUKReportPath} + --sarcsReportPathArray${sarcsReportPathArray} + 
--sarcsReportPathNonArray${sarcsReportPathNonArray} + --lareferenciaLogPath${lareferenciaLogPath} + --lareferenciaBaseURL${lareferenciaBaseURL} + --lareferenciaAuthToken${lareferenciaAuthToken} + --dbHiveUrl${hiveJdbcUrl} + --dbImpalaUrl${impalaJdbcUrl} + --usageStatsDBSchema${usageStatsDBSchema} + --statsDBSchema${statsDBSchema} + --recreateDbAndTables${recreateDbAndTables} + --piwikEmptyDirs${piwikEmptyDirs} + --downloadPiwikLogs${downloadPiwikLogs} + --processPiwikLogs${processPiwikLogs} + --startingLogPeriod${startingLogPeriod} + --numberOfPiwikIdsToDownload${numberOfPiwikIdsToDownload} + --numberOfSiteIdsToDownload${numberOfSiteIdsToDownload} + --laReferenciaEmptyDirs${laReferenciaEmptyDirs} + --downloadLaReferenciaLogs${downloadLaReferenciaLogs} + --processLaReferenciaLogs${processLaReferenciaLogs} + --irusCreateTablesEmptyDirs${irusCreateTablesEmptyDirs} + --irusDownloadReports${irusDownloadReports} + --irusProcessStats${irusProcessStats} + --irusNumberOfOpendoarsToDownload${irusNumberOfOpendoarsToDownload} + --sarcCreateTablesEmptyDirs${sarcCreateTablesEmptyDirs} + --sarcDownloadReports${sarcDownloadReports} + --sarcProcessStats${sarcProcessStats} + --sarcNumberOfIssnToDownload${sarcNumberOfIssnToDownload} + --finalizeStats${finalizeStats} + --numberOfDownloadThreads${numberOfDownloadThreads} + + + + + + + + diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml new file mode 100644 index 000000000..20d2f5b76 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -0,0 +1,91 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.2.4-SNAPSHOT + + 4.0.0 + dhp-usage-stats-build + + + + pl.project13.maven + git-commit-id-plugin + 2.1.15 + + + + revision + + + + + ${project.basedir}/../.git + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.1 + + 1.8 + 1.8 + + + + + + UTF-8 + UTF-8 + 0.13.1-cdh5.2.1 + 2.5.0-cdh5.2.1 + + + + + org.apache.spark + spark-core_2.11 + 2.2.0 + + + org.apache.spark + 
// dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ConnectDB.java

package eu.dnetlib.oa.graph.usagestatsbuild.export;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties;

import org.apache.log4j.Logger;

import com.mchange.v2.c3p0.ComboPooledDataSource;

/**
 * Static holder for the Hive and Impala JDBC connections and for the schema names used by the
 * usage-stats-build workflow. All settings are copied from {@link ExecuteWorkflow} by {@link #init()}.
 *
 * @author D. Pierrakos, S. Zoupanos
 */
public abstract class ConnectDB {

    public static Connection DB_HIVE_CONNECTION;
    public static Connection DB_IMPALA_CONNECTION;

    private static String dbHiveUrl;
    private static String dbImpalaUrl;
    private static String usageRawDataDBSchema;
    private static String usageStatsDBSchema;
    private static String usagestatsPermanentDBSchema;
    private static String statsDBSchema;
    private final static Logger log = Logger.getLogger(ConnectDB.class);

    /**
     * Copies the connection URLs and schema names from the static fields of {@link ExecuteWorkflow}
     * and loads the Hive JDBC driver class.
     *
     * @throws ClassNotFoundException if {@code org.apache.hive.jdbc.HiveDriver} is not on the classpath
     */
    static void init() throws ClassNotFoundException {

        dbHiveUrl = ExecuteWorkflow.dbHiveUrl;
        dbImpalaUrl = ExecuteWorkflow.dbImpalaUrl;
        usageStatsDBSchema = ExecuteWorkflow.usageStatsDBSchema;
        statsDBSchema = ExecuteWorkflow.statsDBSchema;
        usageRawDataDBSchema = ExecuteWorkflow.usageRawDataDBSchema;
        usagestatsPermanentDBSchema = ExecuteWorkflow.usagestatsPermanentDBSchema;

        Class.forName("org.apache.hive.jdbc.HiveDriver");
    }

    /**
     * Returns the cached Hive connection, opening a new one when none exists yet or the cached one
     * has been closed.
     *
     * @return an open Hive connection
     * @throws SQLException if the connection state cannot be read or a new connection cannot be obtained
     */
    public static Connection getHiveConnection() throws SQLException {
        if (DB_HIVE_CONNECTION == null || DB_HIVE_CONNECTION.isClosed()) {
            DB_HIVE_CONNECTION = connect(dbHiveUrl);
        }
        return DB_HIVE_CONNECTION;
    }

    /**
     * Returns the cached Impala connection, opening a new one when none exists yet or the cached one
     * has been closed.
     *
     * @return an open Impala connection
     * @throws SQLException if the connection state cannot be read or a new connection cannot be obtained
     */
    public static Connection getImpalaConnection() throws SQLException {
        if (DB_IMPALA_CONNECTION == null || DB_IMPALA_CONNECTION.isClosed()) {
            DB_IMPALA_CONNECTION = connect(dbImpalaUrl);
        }
        return DB_IMPALA_CONNECTION;
    }

    /** @return the raw-data schema name exactly as configured */
    public static String getUsageRawDataDBSchema() {
        return ConnectDB.usageRawDataDBSchema;
    }

    /**
     * Returns the configured usage-stats schema name suffixed with {@code _} and today's date.
     *
     * @return {@code <usageStatsDBSchema>_<yyyyMMdd>}
     */
    public static String getUsageStatsDBSchema() {
        // "yyyy" is the calendar year. The previous pattern used "YYYY", which SimpleDateFormat
        // interprets as the week-based year and which therefore yields the wrong year for dates
        // in the first/last week of a year.
        DateFormat df = new SimpleDateFormat("yyyyMMdd");
        String todayAsString = df.format(Calendar.getInstance().getTime());

        return ConnectDB.usageStatsDBSchema + "_" + todayAsString;
    }

    /** @return the stats schema name exactly as configured */
    public static String getStatsDBSchema() {
        return ConnectDB.statsDBSchema;
    }

    /** @return the permanent usage-stats schema name exactly as configured */
    public static String getUsagestatsPermanentDBSchema() {
        return ConnectDB.usagestatsPermanentDBSchema;
    }

    /**
     * Builds a c3p0 pooled data source for the given JDBC URL and hands out one connection from it.
     * Shared by the Hive and the Impala getters, which previously carried two identical copies of
     * this configuration.
     *
     * @param jdbcUrl JDBC URL to connect to
     * @return a connection obtained from the newly created pool
     * @throws SQLException if no connection can be obtained
     */
    private static Connection connect(String jdbcUrl) throws SQLException {
        // NOTE(review): a fresh pool is created on every (re)connect and never closed, exactly as
        // before this change; consider keeping one data source per URL.
        ComboPooledDataSource cpds = new ComboPooledDataSource();
        cpds.setJdbcUrl(jdbcUrl);
        cpds.setAcquireIncrement(1);
        cpds.setMaxPoolSize(100);
        cpds.setMinPoolSize(1);
        cpds.setInitialPoolSize(1);
        cpds.setMaxIdleTime(300);
        cpds.setMaxConnectionAge(36000);

        cpds.setAcquireRetryAttempts(30);
        cpds.setAcquireRetryDelay(2000);
        cpds.setBreakAfterAcquireFailure(false);

        cpds.setCheckoutTimeout(0);
        cpds.setPreferredTestQuery("SELECT 1");
        cpds.setIdleConnectionTestPeriod(60);

        return cpds.getConnection();
    }

}
// dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/ExecuteWorkflow.java

package eu.dnetlib.oa.graph.usagestatsbuild.export;

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.BasicConfigurator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

/**
 * Command-line entry point of the usage-stats-build workflow. Parses the arguments declared in
 * {@code usagestatsbuild_parameters.json}, publishes them through static fields (read by
 * {@link ConnectDB} among others) and then runs {@code UsageStatsExporter.export()}.
 *
 * @author D. Pierrakos, S. Zoupanos
 */
public class ExecuteWorkflow {

    // Declared for other classes of the package; this class never assigns it.
    static String matomoBaseURL;
    static String repoLogPath;
    static String portalLogPath;
    static String portalMatomoID;
    static String irusUKReportPath;
    static String sarcsReportPathArray;
    static String sarcsReportPathNonArray;
    static String lareferenciaLogPath;
    static String dbHiveUrl;
    static String dbImpalaUrl;
    static String usageRawDataDBSchema;
    static String usageStatsDBSchema;
    static String usagestatsPermanentDBSchema;
    static String statsDBSchema;
    static boolean recreateDbAndTables;

    static boolean processPiwikLogs;
    static boolean processLaReferenciaLogs;

    static boolean irusProcessStats;

    static boolean sarcProcessStats;

    static boolean finalizeStats;
    static boolean finalTablesVisibleToImpala;

    static int numberOfDownloadThreads;

    // Was created for PiwikStatsDB.class, which mislabels anything logged from here.
    private static final Logger logger = LoggerFactory.getLogger(ExecuteWorkflow.class);

    /**
     * Parses the workflow arguments into the static fields of this class and starts the export.
     *
     * @param args command-line arguments in the form expected by {@link ArgumentApplicationParser}
     * @throws Exception if the arguments cannot be parsed or the export fails
     */
    public static void main(String[] args) throws Exception {

        // Sending the logs to the console
        BasicConfigurator.configure();

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    UsageStatsExporter.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json")));
        parser.parseArgument(args);

        // Paths and identifiers
        repoLogPath = parser.get("repoLogPath");
        portalLogPath = parser.get("portalLogPath");
        portalMatomoID = parser.get("portalMatomoID");
        irusUKReportPath = parser.get("irusUKReportPath");
        sarcsReportPathArray = parser.get("sarcsReportPathArray");
        sarcsReportPathNonArray = parser.get("sarcsReportPathNonArray");
        lareferenciaLogPath = parser.get("lareferenciaLogPath");

        // Database endpoints and schema names
        dbHiveUrl = parser.get("dbHiveUrl");
        dbImpalaUrl = parser.get("dbImpalaUrl");
        usageRawDataDBSchema = parser.get("usageRawDataDBSchema");
        usageStatsDBSchema = parser.get("usageStatsDBSchema");
        usagestatsPermanentDBSchema = parser.get("usagestatsPermanentDBSchema");
        statsDBSchema = parser.get("statsDBSchema");

        // Step switches: "true" in any letter case enables a step, everything else disables it.
        // Boolean.parseBoolean also maps a missing (null) value to false instead of failing with a
        // NullPointerException.
        processPiwikLogs = Boolean.parseBoolean(parser.get("processPiwikLogs"));
        recreateDbAndTables = Boolean.parseBoolean(parser.get("recreateDbAndTables"));
        processLaReferenciaLogs = Boolean.parseBoolean(parser.get("processLaReferenciaLogs"));
        irusProcessStats = Boolean.parseBoolean(parser.get("irusProcessStats"));
        sarcProcessStats = Boolean.parseBoolean(parser.get("sarcProcessStats"));
        finalizeStats = Boolean.parseBoolean(parser.get("finalizeStats"));
        finalTablesVisibleToImpala = Boolean.parseBoolean(parser.get("finalTablesVisibleToImpala"));

        // Previously this assignment sat in the else-branch of the finalTablesVisibleToImpala test,
        // so the thread count was silently left at 0 whenever that flag was true. It is now read
        // whenever a value is supplied; with the flag off it is still parsed unconditionally, so a
        // missing value fails exactly as it did before.
        // Assumes parser.get returns null for an argument that was not supplied - TODO confirm.
        final String downloadThreads = parser.get("numberOfDownloadThreads");
        if (downloadThreads != null || !finalTablesVisibleToImpala) {
            numberOfDownloadThreads = Integer.parseInt(downloadThreads);
        }

        UsageStatsExporter usagestatsExport = new UsageStatsExporter();
        usagestatsExport.export();
    }

    /**
     * Wraps a date in a {@link Calendar}. Currently not called from this class.
     *
     * @param date the instant the calendar is set to
     * @return a calendar of the default time zone and locale set to {@code date}
     */
    private static Calendar startingLogPeriodStr(Date date) {

        Calendar calendar = Calendar.getInstance();
        calendar.setTime(date);
        return calendar;

    }
}

// dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/IrusStats.java

package eu.dnetlib.oa.graph.usagestatsbuild.export;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Builds the temporary IRUS-UK downloads table of the usage-stats schema from the raw
 * {@code sushilog} data.
 *
 * @author D. Pierrakos, S. Zoupanos
 */
public class IrusStats {

    private String irusUKURL;

    private static final Logger logger = LoggerFactory.getLogger(IrusStats.class);

    public IrusStats() throws Exception {
    }

    /**
     * Creates {@code irus_downloads_stats_tmp} if it does not exist and appends to it the
     * {@code ft_total} rows of source {@code IRUS-UK} from the raw {@code sushilog} table, joined
     * with {@code datasource_oids} and {@code result_oids} of the stats schema.
     *
     * @throws Exception if the Hive connection cannot be obtained or a statement fails
     */
    public void processIrusStats() throws Exception {
        // try-with-resources: the statement is now closed even when one of the updates throws.
        try (Statement stmt = ConnectDB.getHiveConnection().createStatement()) {
            ConnectDB.getHiveConnection().setAutoCommit(false);

            logger.info("Creating irus_downloads_stats_tmp table");
            String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema()
                + ".irus_downloads_stats_tmp "
                + "(`source` string, "
                + "`repository_id` string, "
                + "`result_id` string, "
                + "`date` string, "
                + "`count` bigint, "
                + "`openaire` bigint)";
            stmt.executeUpdate(createDownloadsStats);
            logger.info("Created irus_downloads_stats_tmp table");

            logger.info("Inserting into irus_downloads_stats_tmp");
            String insertDStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp "
                + "SELECT s.source, d.id AS repository_id, "
                + "ro.id as result_id, CONCAT(YEAR(date), '/', LPAD(MONTH(date), 2, '0')) as date, s.count, '0' "
                + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, "
                + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
                + ConnectDB.getStatsDBSchema() + ".result_oids ro "
                + "WHERE s.repository=d.oid AND s.rid=ro.oid AND metric_type='ft_total' AND s.source='IRUS-UK'";
            stmt.executeUpdate(insertDStats);
            logger.info("Inserted into irus_downloads_stats_tmp");
        }
        // The shared Hive connection is deliberately left open for the following workflow steps.
    }

}
// dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/LaReferenciaStats.java

package eu.dnetlib.oa.graph.usagestatsbuild.export;

import java.io.*;
import java.net.URLDecoder;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Builds the temporary LaReferencia views and downloads tables of the usage-stats schema from the
 * raw {@code lareferencialog} table.
 *
 * @author D. Pierrakos, S. Zoupanos
 */
public class LaReferenciaStats {

    private static final Logger logger = LoggerFactory.getLogger(LaReferenciaStats.class);

    private String logRepoPath;

    private Statement stmt = null;

    private String CounterRobotsURL;
    private ArrayList robotsList;

    public LaReferenciaStats() throws Exception {
    }

    /**
     * Builds the views statistics and then the downloads statistics.
     *
     * @throws Exception wrapping any failure of either step; the original exception is kept as cause
     */
    public void processLogs() throws Exception {
        try {
            logger.info("LaReferencia creating viewsStats");
            viewsStats();
            logger.info("LaReferencia created viewsStats");
            logger.info("LaReferencia creating downloadsStats");
            downloadsStats();
            logger.info("LaReferencia created downloadsStats");
        } catch (Exception e) {
            // Pass the throwable as well so the stack trace reaches the log.
            logger.error("Failed to process logs: " + e, e);
            throw new Exception("Failed to process logs: " + e.toString(), e);
        }
    }

    /**
     * Recreates {@code la_result_views_monthly_tmp} and {@code la_views_stats_tmp} from the log rows
     * whose action is {@code 'action'}.
     *
     * @throws Exception if the Hive connection cannot be obtained or a statement fails
     */
    public void viewsStats() throws Exception {
        buildMonthlyStats("views", "action");
    }

    /**
     * Recreates {@code la_result_downloads_monthly_tmp} and {@code la_downloads_stats_tmp} from the
     * log rows whose action is {@code 'download'}.
     *
     * @throws Exception if the Hive connection cannot be obtained or a statement fails
     */
    private void downloadsStats() throws Exception {
        buildMonthlyStats("downloads", "download");
    }

    /**
     * Shared implementation of {@link #viewsStats()} and {@link #downloadsStats()}: (re)creates the
     * monthly aggregation view {@code la_result_<metric>_monthly_tmp}, drops
     * {@code la_<metric>_stats_tmp} and rebuilds it by joining the view with {@code datasource_oids}
     * and {@code result_oids} of the stats schema.
     *
     * @param metric name of the counter column and infix of the object names ("views" or "downloads")
     * @param action value of the {@code action} column selecting the log rows to count
     * @throws Exception if the Hive connection cannot be obtained or a statement fails
     */
    private void buildMonthlyStats(String metric, String action) throws Exception {
        final String monthlyView = "la_result_" + metric + "_monthly_tmp";
        final String statsTable = "la_" + metric + "_stats_tmp";
        // The usage-stats schema name carries today's date; read it once so every statement of this
        // run targets the same schema.
        final String usageSchema = ConnectDB.getUsageStatsDBSchema();

        // try-with-resources: the statement is now closed even when one of the updates throws.
        try (Statement statement = ConnectDB.getHiveConnection().createStatement()) {
            ConnectDB.getHiveConnection().setAutoCommit(false);

            logger.info("Creating " + monthlyView + " view");
            String sql = "CREATE OR REPLACE VIEW " + usageSchema + "." + monthlyView + " AS "
                + "SELECT entity_id AS id, COUNT(entity_id) as " + metric
                + ", SUM(CASE WHEN referrer_name LIKE '%openaire%' "
                + "THEN 1 ELSE 0 END) AS openaire_referrer, "
                + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source "
                + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".lareferencialog where action='" + action
                + "' and "
                + "(source_item_type='oaItem' or source_item_type='repItem') "
                + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), "
                + "source ORDER BY source, entity_id";
            statement.executeUpdate(sql);
            logger.info("Created " + monthlyView + " view");

            logger.info("Dropping " + statsTable + " table");
            sql = "DROP TABLE IF EXISTS " + usageSchema + "." + statsTable;
            statement.executeUpdate(sql);
            logger.info("Dropped " + statsTable + " table");

            logger.info("Creating " + statsTable + " table");
            sql = "CREATE TABLE IF NOT EXISTS " + usageSchema + "." + statsTable + " "
                + "AS SELECT 'LaReferencia' as source, d.id as repository_id, ro.id as result_id, month as date, "
                + "max(" + metric + ") AS count, max(openaire_referrer) AS openaire "
                + "FROM " + usageSchema + "." + monthlyView + " p, "
                + ConnectDB.getStatsDBSchema() + ".datasource_oids d, "
                + ConnectDB.getStatsDBSchema() + ".result_oids ro "
                + "WHERE p.source=d.oid AND p.id=ro.oid "
                + "GROUP BY d.id, ro.id, month "
                + "ORDER BY d.id, ro.id, month";
            statement.executeUpdate(sql);
            logger.info("Created " + statsTable + " table");
        }
        // The shared Hive connection is deliberately left open for the following workflow steps.
    }

}
a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java new file mode 100644 index 000000000..253dc03b5 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/PiwikStatsDB.java @@ -0,0 +1,559 @@ + +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.sql.Connection; +import java.sql.SQLException; +import java.sql.Statement; +import java.sql.Timestamp; +import java.text.SimpleDateFormat; +import java.util.*; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. Zoupanos + */ +public class PiwikStatsDB { + + private String logPath; + + private Statement stmt = null; + + private static final Logger logger = LoggerFactory.getLogger(PiwikStatsDB.class); + + public PiwikStatsDB() throws Exception { + + } + + public void recreateDBAndTables() throws Exception { + this.createDatabase(); + // The piwiklog table is not needed since it is built + // on top of JSON files + //////////// this.createTmpTables(); + } + + private void createDatabase() throws Exception { + +// try { +// +// stmt = ConnectDB.getHiveConnection().createStatement(); +// +// logger.info("Dropping usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); +// String dropDatabase = "DROP DATABASE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + " CASCADE"; +// stmt.executeUpdate(dropDatabase); +// } catch (Exception e) { +// logger.error("Failed to drop database: " + e); +// throw new Exception("Failed to drop database: " + e.toString(), e); +// } +// + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + logger.info("Creating usagestats DB: " + ConnectDB.getUsageStatsDBSchema()); + String createDatabase = "CREATE DATABASE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema(); + stmt.executeUpdate(createDatabase); + 
logger.info("Usagestats DB created: " + ConnectDB.getUsageStatsDBSchema()); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + + try { + stmt = ConnectDB.getHiveConnection().createStatement(); + + logger.info("Creating permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema()); + String createPermanentDatabase = "CREATE DATABASE IF NOT EXISTS " + + ConnectDB.getUsagestatsPermanentDBSchema(); + stmt.executeUpdate(createPermanentDatabase); + logger.info("Created permanent usagestats DB: " + ConnectDB.getUsagestatsPermanentDBSchema()); + + } catch (Exception e) { + logger.error("Failed to create database: " + e); + throw new Exception("Failed to create database: " + e.toString(), e); + } + } + + public void processLogs() throws Exception { + try { + + logger.info("ViewsStats processing starts at: " + new Timestamp(System.currentTimeMillis())); + viewsStats(); + logger.info("ViewsStats processing ends at: " + new Timestamp(System.currentTimeMillis())); + + logger.info("DownloadsStats processing starts at: " + new Timestamp(System.currentTimeMillis())); + downloadsStats(); + logger.info("DownloadsStats processing ends at: " + new Timestamp(System.currentTimeMillis())); + + } catch (Exception e) { + logger.error("Failed to process logs: " + e); + throw new Exception("Failed to process logs: " + e.toString(), e); + } + } + + public void viewsStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping openaire_result_views_monthly_tmp view"); + String drop_result_views_monthly = "DROP VIEW IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_piwikresult_views_monthly_tmp"; + stmt.executeUpdate(drop_result_views_monthly); + logger.info("Dropped openaire_result_views_monthly_tmp view"); + + logger.info("Creating 
openaire_result_views_monthly_tmp view"); + String create_result_views_monthly = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_views_monthly_tmp " + + "AS SELECT entity_id, " + + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id," + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) " + + "AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + + ".piwiklog where action='action' and (source_item_type='oaItem' or " + + "source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), " + + "source ORDER BY source, entity_id"; + stmt.executeUpdate(create_result_views_monthly); + logger.info("Created openaire_result_views_monthly_tmp table"); + + logger.info("Dropping openaire_views_stats_tmp table"); + String drop_views_stats = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp"; + stmt.executeUpdate(drop_views_stats); + logger.info("Dropped openaire_views_stats_tmp table"); + + logger.info("Creating openaire_views_stats_tmp table"); + String create_views_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_views_stats_tmp " + + "AS SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.piwik_id AND p.id=ro.oid AND ro.oid!='200' " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; + stmt.executeUpdate(create_views_stats); + logger.info("Created openaire_views_stats_tmp table"); + + logger.info("Creating 
openaire_pageviews_stats_tmp table"); + String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_pageviews_stats_tmp AS SELECT " + + "'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, max(views) AS count " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=" + ExecuteWorkflow.portalMatomoID + + " AND p.source=d.piwik_id and p.id=ro.id AND ro.oid!='200' " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; + stmt.executeUpdate(create_pageviews_stats); + logger.info("Created pageviews_stats table"); + + stmt.close(); + // ConnectDB.getHiveConnection().close(); + } + + private void downloadsStats() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Dropping openaire_result_downloads_monthly_tmp view"); + String drop_result_downloads_monthly = "DROP VIEW IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_downloads_monthly_tmp"; + stmt.executeUpdate(drop_result_downloads_monthly); + logger.info("Dropped openaire_result_downloads_monthly_tmp view"); + + logger.info("Creating openaire_result_downloads_monthly_tmp view"); + String sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + + ".openaire_result_downloads_monthly_tmp " + + "AS SELECT entity_id, " + + "reflect('java.net.URLDecoder', 'decode', entity_id) AS id," + + "COUNT(entity_id) as downloads, " + + "SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog where action='download' " + + "AND (source_item_type='oaItem' OR 
source_item_type='repItem') " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) , source " + + "ORDER BY source, entity_id, month"; + stmt.executeUpdate(sql); + logger.info("Created openaire_result_downloads_monthly_tmp view"); + + logger.info("Dropping openaire_downloads_stats_tmp table"); + String drop_views_stats = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".openaire_downloads_stats_tmp"; + stmt.executeUpdate(drop_views_stats); + logger.info("Dropped openaire_downloads_stats_tmp table"); + + logger.info("Creating openaire_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(downloads) AS count, max(openaire_referrer) AS openaire " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE p.source=d.piwik_id and p.id=ro.oid AND ro.oid!='200' " + + "GROUP BY d.id, ro.id, month " + + "ORDER BY d.id, ro.id, month "; + stmt.executeUpdate(sql); + logger.info("Created downloads_stats table"); + + logger.info("Dropping openaire_result_downloads_monthly_tmp view"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".openaire_result_downloads_monthly_tmp"; + logger.info("Dropped openaire_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + stmt.close(); + // ConnectDB.getHiveConnection().close(); + } + + public void uploadOldPedocs() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping Pedocs pedocs_views_stats_tmp table + logger.info("Dropping Pedocs pedocs_views_stats_tmp table"); + String sql = "DROP TABLE IF EXISTS " + 
ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp"; + logger.info("Dropped pedocs_views_stats_tmp table "); + stmt.executeUpdate(sql); + + // Dropping Pedocs pedocs_downloads_stats table + logger.info("Dropping pedocs_downloads_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats"; + logger.info("Dropped pedocs_downloads_stats table "); + stmt.executeUpdate(sql); + + // Creating Pedocs pedocs_views_stats_tmp table + logger.info("Creating Pedocs pedocs_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id," + + "r.id as result_id,date,counter_abstract as count, 0 as openaire " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsoldviews p, " + ConnectDB.getStatsDBSchema() + + ".result_oids r where r.oid=p.identifier"; + stmt.executeUpdate(sql); + logger.info("Created pedocs_views_stats_tmp table "); + + // Creating Pedocs pedocs_downloads_stats_tmp table + logger.info("Creating Pedocs pedocs_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, 'opendoar____::ab1a4d0dd4d48a2ba1077c4494791306' as repository_id," + + "r.id as result_id, date, counter as count, 0 as openaire " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".pedocsolddownloads p, " + ConnectDB.getStatsDBSchema() + + ".result_oids r where r.oid=p.identifier"; + stmt.executeUpdate(sql); + logger.info("Created pedocs_downloads_stats_tmp table "); + + } + + public void uploadTUDELFTStats() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT 
tudelft_result_views_monthly_tmp view"); + String sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp"; + logger.info("Dropped tudelft_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp"; + logger.info("Dropped tudelft_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_views_stats_tmp table + logger.info("Dropping TUDELFT tudelft_views_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp"; + logger.info("Dropped tudelft_views_stats_tmp table "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_downloads_stats_tmp table + logger.info("Dropping TUDELFT tudelft_downloads_stats_tmp table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp"; + logger.info("Dropped tudelft_downloads_stats_tmp table "); + stmt.executeUpdate(sql); + + // Creating TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Creating TUDELFT tudelft_result_views_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='action' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, 
'0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created tudelft_result_views_monthly_tmp view "); + + // Creating TUDELFT tudelft_views_stats_tmp table + logger.info("Creating TUDELFT tudelft_views_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".tudelft_result_views_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created TUDELFT tudelft_views_stats_tmp table"); + + // Creating TUDELFT tudelft_result_downloads_monthly_tmp view + logger.info("Creating TUDELFT tudelft_result_downloads_monthly_tmp view"); + sql = "CREATE OR REPLACE VIEW " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp " + + "AS SELECT entity_id, reflect('java.net.URLDecoder', 'decode', entity_id) AS id, " + + "COUNT(entity_id) as views, SUM(CASE WHEN referrer_name LIKE '%openaire%' THEN 1 ELSE 0 END) AS openaire_referrer, " + + "CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')) AS month, source " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".piwiklog " + + "WHERE action='download' and (source_item_type='oaItem' or source_item_type='repItem') and source=252 " + + "GROUP BY entity_id, CONCAT(YEAR(timestamp), '/', LPAD(MONTH(timestamp), 2, '0')), source ORDER BY source, entity_id"; + stmt.executeUpdate(sql); + logger.info("Created tudelft_result_downloads_monthly_tmp view "); + + // Creating TUDELFT tudelft_downloads_stats_tmp table + logger.info("Creating TUDELFT 
tudelft_downloads_stats_tmp table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp AS " + + "SELECT 'OpenAIRE' as source, d.id as repository_id, ro.id as result_id, month as date, " + + "max(views) AS count, max(openaire_referrer) AS openaire FROM " + ConnectDB.getUsageStatsDBSchema() + + ".tudelft_result_downloads_monthly_tmp p, " + + ConnectDB.getStatsDBSchema() + ".datasource d, " + ConnectDB.getStatsDBSchema() + ".result_oids ro " + + "WHERE concat('tud:',p.id)=ro.oid and d.id='opendoar____::c9892a989183de32e976c6f04e700201' " + + "GROUP BY d.id, ro.id, month ORDER BY d.id, ro.id"; + stmt.executeUpdate(sql); + logger.info("Created TUDELFT tudelft_downloads_stats_tmp table"); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_views_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_views_monthly_tmp"; + logger.info("Dropped tudelft_result_views_monthly_tmp view "); + stmt.executeUpdate(sql); + + // Dropping TUDELFT tudelft_result_views_monthly_tmp view + logger.info("Dropping TUDELFT tudelft_result_downloads_monthly_tmp view"); + sql = "DROP view IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_result_downloads_monthly_tmp"; + logger.info("Dropped tudelft_result_downloads_monthly_tmp view "); + stmt.executeUpdate(sql); + + } + + public void finalizeStats() throws Exception { + stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + // Dropping views_stats table + logger.info("Dropping views_stats table"); + String sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + logger.info("Dropped views_stats table "); + stmt.executeUpdate(sql); + + // Dropping downloads_stats table + logger.info("Dropping downloads_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + 
".downloads_stats"; + logger.info("Dropped downloads_stats table "); + stmt.executeUpdate(sql); + + // Dropping page_views_stats table + logger.info("Dropping pageviews_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + logger.info("Dropped pageviews_stats table "); + stmt.executeUpdate(sql); + + // Dropping usage_stats table + logger.info("Dropping usage_stats table"); + sql = "DROP TABLE IF EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + logger.info("Dropped usage_stats table "); + stmt.executeUpdate(sql); + + // Creating views_stats table + logger.info("Creating views_stats table"); + String createViewsStats = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".views_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp STORED AS PARQUET"; + stmt.executeUpdate(createViewsStats); + logger.info("Created views_stats table"); + + // Inserting OpenAIRE views stats + logger.info("Inserting Openaire data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Openaire views updated to views_stats"); + + // Inserting Pedocs old views stats + logger.info("Inserting Pedocs old data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Pedocs views updated to views_stats"); + + // Inserting TUDELFT views stats + logger.info("Inserting TUDELFT data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("TUDELFT views updated to views_stats"); + + // 
Inserting Lareferencia views stats + logger.info("Inserting LaReferencia data to views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".views_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_views_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("LaReferencia views updated to views_stats"); + + logger.info("Creating downloads_stats table"); + String createDownloadsStats = "CREATE TABLE IF NOT EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".downloads_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp STORED AS PARQUET"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created downloads_stats table"); + + // Inserting OpenAIRE downloads stats + logger.info("Inserting OpenAIRE data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Inserted OpenAIRE data to downloads_stats"); + + // Inserting Pedocs old downloads stats + logger.info("Inserting PeDocs old data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pedocs_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Inserted Pedocs data to downloads_stats"); + + // Inserting TUDELFT downloads stats + logger.info("Inserting TUDELFT old data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".tudelft_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Inserted TUDELFT data to downloads_stats"); + + // Inserting Lareferencia downloads stats + logger.info("Inserting LaReferencia data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + 
"SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".la_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("Lareferencia downloads updated to downloads_stats"); + + // Inserting IRUS downloads stats + logger.info("Inserting IRUS data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".irus_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("IRUS downloads updated to downloads_stats"); + + // Inserting SARC-OJS downloads stats + logger.info("Inserting SARC data to downloads_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp"; + stmt.executeUpdate(sql); + logger.info("SARC-OJS downloads updated to downloads_stats"); + + logger.info("Creating pageviews_stats table"); + String create_pageviews_stats = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".pageviews_stats " + + "LIKE " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp STORED AS PARQUET"; + stmt.executeUpdate(create_pageviews_stats); + logger.info("Created pageviews_stats table"); + + // Inserting OpenAIRE views stats from Portal + logger.info("Inserting data to page_views_stats"); + sql = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats " + + "SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".openaire_pageviews_stats_tmp"; + stmt.executeUpdate(sql); + + logger.info("Dropping full_dates table"); + String dropFullDates = "DROP TABLE IF EXISTS " + + ConnectDB.getUsageStatsDBSchema() + + ".full_dates"; + stmt.executeUpdate(dropFullDates); + logger.info("Dropped full_dates table"); + + Calendar startCalendar = Calendar.getInstance(); + startCalendar.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2016-01-01")); + Calendar endCalendar = Calendar.getInstance(); + int diffYear = 
endCalendar.get(Calendar.YEAR) - startCalendar.get(Calendar.YEAR); + int diffMonth = diffYear * 12 + endCalendar.get(Calendar.MONTH) - startCalendar.get(Calendar.MONTH); + + logger.info("Creating full_dates table"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".full_dates AS " + + "SELECT from_unixtime(unix_timestamp(cast(add_months(from_date,i) AS DATE)), 'yyyy/MM') AS txn_date " + + "FROM (SELECT DATE '2016-01-01' AS from_date) p " + + "LATERAL VIEW " + + "posexplode(split(space(" + diffMonth + "),' ')) pe AS i,x"; + stmt.executeUpdate(sql); + logger.info("Created full_dates table"); + + logger.info("Inserting data to usage_stats"); + sql = "CREATE TABLE IF NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats AS " + + "SELECT coalesce(ds.source, vs.source) as source, " + + "coalesce(ds.repository_id, vs.repository_id) as repository_id, " + + "coalesce(ds.result_id, vs.result_id) as result_id, coalesce(ds.date, vs.date) as date, " + + "coalesce(ds.count, 0) as downloads, coalesce(vs.count, 0) as views, " + + "coalesce(ds.openaire, 0) as openaire_downloads, " + + "coalesce(vs.openaire, 0) as openaire_views " + + "FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats AS ds FULL OUTER JOIN " + + ConnectDB.getUsageStatsDBSchema() + ".views_stats AS vs ON ds.source=vs.source " + + "AND ds.repository_id=vs.repository_id AND ds.result_id=vs.result_id AND ds.date=vs.date"; + stmt.executeUpdate(sql); + logger.info("Inserted data to usage_stats"); + + logger.info("Building views at permanent DB starts at: " + new Timestamp(System.currentTimeMillis())); + + logger.info("Dropping view views_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view views_stats on permanent usagestats DB"); + + logger.info("Create view views_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + 
ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + logger.info("Created view views_stats on permanent usagestats DB"); + + logger.info("Dropping view pageviews_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view pageviews_stats on permanent usagestats DB"); + + logger.info("Create view pageviews_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + logger.info("Created view pageviews_stats on permanent usagestats DB"); + + logger.info("Dropping view downloads_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view on downloads_stats on permanent usagestats DB"); + + logger.info("Create view on downloads_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats" + + " AS SELECT * FROM " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + logger.info("Created view on downloads_stats on permanent usagestats DB"); + + logger.info("Dropping view usage_stats on permanent usagestats DB"); + sql = "DROP VIEW IF EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + logger.info("Dropped view on usage_stats on permanent usagestats DB"); + + logger.info("Create view on usage_stats on permanent usagestats DB"); + sql = "CREATE VIEW IF NOT EXISTS " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats" + + " AS SELECT * FROM " + 
ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + logger.info("Created view on usage_stats on permanent usagestats DB"); + + logger.info("Building views at permanent DB ends at: " + new Timestamp(System.currentTimeMillis())); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } + + private Connection getConnection() throws SQLException { + return ConnectDB.getHiveConnection(); + } +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java new file mode 100644 index 000000000..880233f00 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/SarcStats.java @@ -0,0 +1,107 @@ + +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.io.*; +// import java.io.BufferedReader; +// import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author D. Pierrakos, S. 
Zoupanos + */ +public class SarcStats { + + private Statement stmtHive = null; + private Statement stmtImpala = null; + + private static final Logger logger = LoggerFactory.getLogger(SarcStats.class); + + public SarcStats() throws Exception { +// createTables(); + } + + private void createTables() throws Exception { + try { + + stmtHive = ConnectDB.getHiveConnection().createStatement(); + String sqlCreateTableSushiLog = "CREATE TABLE IF NOT EXISTS sushilog(source TEXT, repository TEXT, rid TEXT, date TEXT, metric_type TEXT, count INT, PRIMARY KEY(source, repository, rid, date, metric_type));"; + stmtHive.executeUpdate(sqlCreateTableSushiLog); + + // String sqlCopyPublicSushiLog="INSERT INTO sushilog SELECT * FROM public.sushilog;"; + // stmt.executeUpdate(sqlCopyPublicSushiLog); + String sqlcreateRuleSushiLog = "CREATE OR REPLACE RULE ignore_duplicate_inserts AS " + + " ON INSERT TO sushilog " + + " WHERE (EXISTS ( SELECT sushilog.source, sushilog.repository," + + "sushilog.rid, sushilog.date " + + "FROM sushilog " + + "WHERE sushilog.source = new.source AND sushilog.repository = new.repository AND sushilog.rid = new.rid AND sushilog.date = new.date AND sushilog.metric_type = new.metric_type)) DO INSTEAD NOTHING;"; + stmtHive.executeUpdate(sqlcreateRuleSushiLog); + String createSushiIndex = "create index if not exists sushilog_duplicates on sushilog(source, repository, rid, date, metric_type);"; + stmtHive.executeUpdate(createSushiIndex); + + stmtHive.close(); + ConnectDB.getHiveConnection().close(); + logger.info("Sushi Tables Created"); + } catch (Exception e) { + logger.error("Failed to create tables: " + e); + throw new Exception("Failed to create tables: " + e.toString(), e); + } + } + + public void processSarc() throws Exception { + Statement stmt = ConnectDB.getHiveConnection().createStatement(); + ConnectDB.getHiveConnection().setAutoCommit(false); + + logger.info("Creating sarc_downloads_stats_tmp table"); + String createDownloadsStats = "CREATE TABLE IF 
NOT EXISTS " + ConnectDB.getUsageStatsDBSchema() + + ".sarc_downloads_stats_tmp " + + "(`source` string, " + + "`repository_id` string, " + + "`result_id` string, " + + "`date` string, " + + "`count` bigint, " + + "`openaire` bigint)"; + stmt.executeUpdate(createDownloadsStats); + logger.info("Created sarc_downloads_stats_tmp table"); + + logger.info("Inserting into sarc_downloads_stats_tmp"); + String insertSarcStats = "INSERT INTO " + ConnectDB.getUsageStatsDBSchema() + ".sarc_downloads_stats_tmp " + + "SELECT s.source, d.id AS repository_id, " + + "ro.id as result_id, CONCAT(CAST(YEAR(`date`) AS STRING), '/', " + + "LPAD(CAST(MONTH(`date`) AS STRING), 2, '0')) AS `date`, s.count, '0' " + + "FROM " + ConnectDB.getUsageRawDataDBSchema() + ".sushilog s, " + + ConnectDB.getStatsDBSchema() + ".datasource_oids d, " + + ConnectDB.getStatsDBSchema() + ".result_pids ro " + + "WHERE d.oid LIKE CONCAT('%', s.repository, '%') AND d.id like CONCAT('%', 'sarcservicod', '%') " + + "AND s.rid=ro.pid AND ro.type='Digital Object Identifier' AND s.metric_type='ft_total' AND s.source='SARC-OJS'"; + stmt.executeUpdate(insertSarcStats); + logger.info("Inserted into sarc_downloads_stats_tmp"); + + stmt.close(); + // ConnectDB.getHiveConnection().close(); + } + +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java new file mode 100644 index 000000000..47986f52a --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/java/eu/dnetlib/oa/graph/usagestatsbuild/export/UsageStatsExporter.java @@ -0,0 +1,127 @@ + +package eu.dnetlib.oa.graph.usagestatsbuild.export; + +import java.io.IOException; +import java.sql.SQLException; +import java.sql.Statement; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Main class for downloading and processing Usage statistics + * + * @author D. Pierrakos, S. Zoupanos + */ +public class UsageStatsExporter { + + public UsageStatsExporter() { + + } + + private static final Logger logger = LoggerFactory.getLogger(UsageStatsExporter.class); + + public void export() throws Exception { + + logger.info("Initialising DB properties"); + ConnectDB.init(); + +// runImpalaQuery(); + PiwikStatsDB piwikstatsdb = new PiwikStatsDB(); + + logger.info("Re-creating database and tables"); + if (ExecuteWorkflow.recreateDbAndTables) { + piwikstatsdb.recreateDBAndTables(); + logger.info("DB-Tables are created "); + } +// else { +// piwikstatsdb.createTmpTables(); +// logger.info("TmpTables are created "); +// } + if (ExecuteWorkflow.processPiwikLogs) { + logger.info("Processing Piwik logs"); + piwikstatsdb.processLogs(); + logger.info("Piwik logs Done"); + logger.info("Processing Pedocs Old Stats"); + piwikstatsdb.uploadOldPedocs(); + logger.info("Processing Pedocs Old Stats Done"); + logger.info("Processing TUDELFT Stats"); + piwikstatsdb.uploadTUDELFTStats(); + logger.info("Processing TUDELFT Stats Done"); + + } + + LaReferenciaStats lastats = new LaReferenciaStats(); + + if (ExecuteWorkflow.processLaReferenciaLogs) { + logger.info("Processing LaReferencia logs"); + lastats.processLogs(); + logger.info("LaReferencia logs done"); + } + + IrusStats irusstats = new IrusStats(); + + if (ExecuteWorkflow.irusProcessStats) { + logger.info("Processing IRUS"); + irusstats.processIrusStats(); + logger.info("Irus done"); + } + + SarcStats sarcStats = new SarcStats(); + + if (ExecuteWorkflow.sarcProcessStats) { + sarcStats.processSarc(); + } + logger.info("Sarc done"); + + // finalize usagestats + if (ExecuteWorkflow.finalizeStats) { + piwikstatsdb.finalizeStats(); + logger.info("Finalized stats"); + } + + // Make the tables available to Impala + if (ExecuteWorkflow.finalTablesVisibleToImpala) { + 
logger.info("Making tables visible to Impala"); + invalidateMetadata(); + } + + logger.info("End"); + } + + private void invalidateMetadata() throws SQLException { + Statement stmt = null; + + stmt = ConnectDB.getImpalaConnection().createStatement(); + + String sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsageStatsDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".downloads_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".views_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".usage_stats"; + stmt.executeUpdate(sql); + + sql = "INVALIDATE METADATA " + ConnectDB.getUsagestatsPermanentDBSchema() + ".pageviews_stats"; + stmt.executeUpdate(sql); + + stmt.close(); + ConnectDB.getHiveConnection().close(); + } +} diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json new file mode 100644 index 000000000..407370ada --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/export/usagestatsbuild_parameters.json @@ -0,0 +1,128 @@ +[ + { + "paramName": "rlp", + "paramLongName": "repoLogPath", + "paramDescription": "nameNode of the source cluster", + "paramRequired": true + }, + { + "paramName": "plp", + "paramLongName": 
"portalLogPath", + "paramDescription": "path of the portal logs", + "paramRequired": true + }, + { + "paramName": "pmi", + "paramLongName": "portalMatomoID", + "paramDescription": "Matomo ID of the portal", + "paramRequired": true + }, + { + "paramName": "iukrp", + "paramLongName": "irusUKReportPath", + "paramDescription": "path of the IRUS-UK reports", + "paramRequired": true + }, + { + "paramName": "srpa", + "paramLongName": "sarcsReportPathArray", + "paramDescription": "path of the SARC reports (array variant)", + "paramRequired": true + }, + { + "paramName": "srpna", + "paramLongName": "sarcsReportPathNonArray", + "paramDescription": "path of the SARC reports (non-array variant)", + "paramRequired": true + }, + { + "paramName": "llp", + "paramLongName": "lareferenciaLogPath", + "paramDescription": "path of the La Referencia logs", + "paramRequired": true + }, + { + "paramName": "dbhu", + "paramLongName": "dbHiveUrl", + "paramDescription": "Hive server jdbc url", + "paramRequired": true + }, + { + "paramName": "dbiu", + "paramLongName": "dbImpalaUrl", + "paramDescription": "Impala server jdbc url", + "paramRequired": true + }, + { + "paramName": "urdbs", + "paramLongName": "usageRawDataDBSchema", + "paramDescription": "DB schema holding the raw usage data", + "paramRequired": true + }, + { + "paramName": "usdbs", + "paramLongName": "usageStatsDBSchema", + "paramDescription": "DB schema where the usage stats tables are built", + "paramRequired": true + }, + { + "paramName": "sdbs", + "paramLongName": "statsDBSchema", + "paramDescription": "DB schema of the
stats database", + "paramRequired": true + }, + { + "paramName": "uspdbs", + "paramLongName": "usagestatsPermanentDBSchema", + "paramDescription": "DB schema of the permanent usage stats tables", + "paramRequired": true + }, + { + "paramName": "rdbt", + "paramLongName": "recreateDbAndTables", + "paramDescription": "Re-create database and initial tables?", + "paramRequired": true + }, + { + "paramName": "ppwl", + "paramLongName": "processPiwikLogs", + "paramDescription": "Process the piwiklogs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "plrl", + "paramLongName": "processLaReferenciaLogs", + "paramDescription": "Process the La Referencia logs (create & fill in the needed tables and process the data) based on the downloaded data", + "paramRequired": true + }, + { + "paramName": "ipr", + "paramLongName": "irusProcessStats", + "paramDescription": "Irus section: Process stats?", + "paramRequired": true + }, + { + "paramName": "spr", + "paramLongName": "sarcProcessStats", + "paramDescription": "Sarc section: Process stats?", + "paramRequired": true + }, + { + "paramName": "fs", + "paramLongName": "finalizeStats", + "paramDescription": "Create the usage_stats table?", + "paramRequired": true + }, + { + "paramName": "ftvi", + "paramLongName": "finalTablesVisibleToImpala", + "paramDescription": "Make the usage_stats, views_stats and downloads_stats tables visible to Impala", + "paramRequired": true + }, + { + "paramName": "nodt", + "paramLongName": "numberOfDownloadThreads", + "paramDescription": "Number of download threads", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/config-default.xml b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/config-default.xml new file mode 100644 index
000000000..b5c807378 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/config-default.xml @@ -0,0 +1,38 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1 + + + impalaJdbcUrl + jdbc:hive2://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/;auth=noSasl; + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + oozie.use.system.libpath + true + + diff --git a/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml new file mode 100644 index 000000000..71e8a50d6 --- /dev/null +++ b/dhp-workflows/dhp-usage-stats-build/src/main/resources/eu/dnetlib/dhp/oa/graph/usagestatsbuild/oozie_app/workflow.xml @@ -0,0 +1,73 @@ + + + + hiveMetastoreUris + Hive server metastore URIs + + + hiveJdbcUrl + Hive server jdbc url + + + impalaJdbcUrl + Impala server jdbc url + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hiveMetastoreUris} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + eu.dnetlib.oa.graph.usagestatsbuild.export.ExecuteWorkflow + --repoLogPath${repoLogPath} + --portalLogPath${portalLogPath} + --portalMatomoID${portalMatomoID} + --irusUKReportPath${irusUKReportPath} + --sarcsReportPathArray${sarcsReportPathArray} + --sarcsReportPathNonArray${sarcsReportPathNonArray} + --lareferenciaLogPath${lareferenciaLogPath} + 
--dbHiveUrl${hiveJdbcUrl} + --dbImpalaUrl${impalaJdbcUrl} + --usageRawDataDBSchema${usageRawDataDBSchema} + --usageStatsDBSchema${usageStatsDBSchema} + --usagestatsPermanentDBSchema${usagestatsPermanentDBSchema} + --statsDBSchema${statsDBSchema} + --recreateDbAndTables${recreateDbAndTables} + --processPiwikLogs${processPiwikLogs} + --processLaReferenciaLogs${processLaReferenciaLogs} + --irusProcessStats${irusProcessStats} + --sarcProcessStats${sarcProcessStats} + --finalizeStats${finalizeStats} + --finalTablesVisibleToImpala${finalTablesVisibleToImpala} + --numberOfDownloadThreads${numberOfDownloadThreads} + + + + + + + + diff --git a/pom.xml b/pom.xml index a2e2587b3..3e0626aed 100644 --- a/pom.xml +++ b/pom.xml @@ -704,7 +704,7 @@ 3.3.3 3.4.2 [2.12,3.0) - 3.1.1 + 3.1.6 7.5.0 4.7.2 1.20