forked from antonis.lempesis/dnet-hadoop
Merge branch 'stable_ids' into import_new_mdstores
This commit is contained in:
commit
f0fbfdcfae
|
@ -21,6 +21,10 @@
|
||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
<artifactId>hadoop-common</artifactId>
|
<artifactId>hadoop-common</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-validator</groupId>
|
||||||
|
<artifactId>commons-validator</artifactId>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_2.11</artifactId>
|
||||||
|
|
|
@ -7,11 +7,13 @@ import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.commons.validator.GenericValidator;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class GraphCleaningFunctions extends CleaningFunctions {
|
public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
|
@ -115,7 +117,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
|
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
|
||||||
}
|
}
|
||||||
} else if (value instanceof Relation) {
|
} else if (value instanceof Relation) {
|
||||||
// nothing to clean here
|
Relation r = (Relation) value;
|
||||||
|
|
||||||
|
if (!isValidDate(r.getValidationDate())) {
|
||||||
|
r.setValidationDate(null);
|
||||||
|
r.setValidated(false);
|
||||||
|
}
|
||||||
|
|
||||||
} else if (value instanceof Result) {
|
} else if (value instanceof Result) {
|
||||||
|
|
||||||
Result r = (Result) value;
|
Result r = (Result) value;
|
||||||
|
@ -292,6 +300,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected static boolean isValidDate(String date) {
|
||||||
|
return Stream
|
||||||
|
.of(ModelSupport.DATE_TIME_FORMATS)
|
||||||
|
.anyMatch(format -> GenericValidator.isDate(date, format, false));
|
||||||
|
}
|
||||||
|
|
||||||
// HELPERS
|
// HELPERS
|
||||||
|
|
||||||
private static boolean isValidAuthorName(Author a) {
|
private static boolean isValidAuthorName(Author a) {
|
||||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.time.format.DateTimeParseException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -15,16 +16,23 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
|
|
||||||
public class OafMapperUtilsTest {
|
public class OafMapperUtilsTest {
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDateValidation() {
|
||||||
|
|
||||||
|
assertTrue(GraphCleaningFunctions.isValidDate("2016-05-07T12:41:19.202Z"));
|
||||||
|
assertTrue(GraphCleaningFunctions.isValidDate("2020-09-10 11:08:52"));
|
||||||
|
assertTrue(GraphCleaningFunctions.isValidDate("2016-04-05"));
|
||||||
|
assertFalse(GraphCleaningFunctions.isValidDate("2016 April 05"));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMergePubs() throws IOException {
|
public void testMergePubs() throws IOException {
|
||||||
Publication p1 = read("publication_1.json", Publication.class);
|
Publication p1 = read("publication_1.json", Publication.class);
|
||||||
|
|
|
@ -56,6 +56,7 @@ object ImportDatacite {
|
||||||
val hdfsTargetPath = new Path(targetPath)
|
val hdfsTargetPath = new Path(targetPath)
|
||||||
log.info(s"hdfsTargetPath is $hdfsTargetPath")
|
log.info(s"hdfsTargetPath is $hdfsTargetPath")
|
||||||
|
|
||||||
|
val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt
|
||||||
|
|
||||||
val spkipImport = parser.get("skipImport")
|
val spkipImport = parser.get("skipImport")
|
||||||
log.info(s"skipImport is $spkipImport")
|
log.info(s"skipImport is $spkipImport")
|
||||||
|
@ -110,7 +111,7 @@ object ImportDatacite {
|
||||||
|
|
||||||
println(s"last Timestamp is $ts")
|
println(s"last Timestamp is $ts")
|
||||||
|
|
||||||
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf)
|
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
||||||
|
|
||||||
println(s"Imported from Datacite API $cnt documents")
|
println(s"Imported from Datacite API $cnt documents")
|
||||||
|
|
||||||
|
@ -137,7 +138,7 @@ object ImportDatacite {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration): Long = {
|
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
|
||||||
var from:Long = timestamp * 1000
|
var from:Long = timestamp * 1000
|
||||||
val delta:Long = 50000000L
|
val delta:Long = 50000000L
|
||||||
var client: DataciteAPIImporter = null
|
var client: DataciteAPIImporter = null
|
||||||
|
@ -148,7 +149,7 @@ object ImportDatacite {
|
||||||
try {
|
try {
|
||||||
var start: Long = System.currentTimeMillis
|
var start: Long = System.currentTimeMillis
|
||||||
while (from < now) {
|
while (from < now) {
|
||||||
client = new DataciteAPIImporter(from, 100, from + delta)
|
client = new DataciteAPIImporter(from, bs, from + delta)
|
||||||
var end: Long = 0
|
var end: Long = 0
|
||||||
val key: IntWritable = new IntWritable(i)
|
val key: IntWritable = new IntWritable(i)
|
||||||
val value: Text = new Text
|
val value: Text = new Text
|
||||||
|
|
|
@ -143,7 +143,6 @@ public class PrepareProgramme {
|
||||||
|
|
||||||
JavaRDD<CSVProgramme> h2020Programmes = programme
|
JavaRDD<CSVProgramme> h2020Programmes = programme
|
||||||
.toJavaRDD()
|
.toJavaRDD()
|
||||||
.filter(p -> p.getFrameworkProgramme().trim().equalsIgnoreCase("H2020"))
|
|
||||||
.mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme))
|
.mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme))
|
||||||
.reduceByKey((a, b) -> {
|
.reduceByKey((a, b) -> {
|
||||||
if (!a.getLanguage().equals("en")) {
|
if (!a.getLanguage().equals("en")) {
|
||||||
|
|
|
@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme;
|
|
||||||
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
|
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
@ -32,7 +31,6 @@ public class PrepareProjects {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
|
private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
private static final HashMap<String, CSVProgramme> programmeMap = new HashMap<>();
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
|
|
@ -120,7 +120,6 @@ public class SparkAtomicActionJob {
|
||||||
.map((MapFunction<Tuple2<CSVProject, CSVProgramme>, Project>) c -> {
|
.map((MapFunction<Tuple2<CSVProject, CSVProgramme>, Project>) c -> {
|
||||||
|
|
||||||
CSVProject csvProject = c._1();
|
CSVProject csvProject = c._1();
|
||||||
Optional<CSVProgramme> ocsvProgramme = Optional.ofNullable(c._2());
|
|
||||||
|
|
||||||
return Optional
|
return Optional
|
||||||
.ofNullable(c._2())
|
.ofNullable(c._2())
|
||||||
|
@ -135,9 +134,9 @@ public class SparkAtomicActionJob {
|
||||||
H2020Programme pm = new H2020Programme();
|
H2020Programme pm = new H2020Programme();
|
||||||
H2020Classification h2020classification = new H2020Classification();
|
H2020Classification h2020classification = new H2020Classification();
|
||||||
pm.setCode(csvProject.getProgramme());
|
pm.setCode(csvProject.getProgramme());
|
||||||
h2020classification.setClassification(ocsvProgramme.get().getClassification());
|
h2020classification.setClassification(csvProgramme.getClassification());
|
||||||
h2020classification.setH2020Programme(pm);
|
h2020classification.setH2020Programme(pm);
|
||||||
setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short());
|
setLevelsandProgramme(h2020classification, csvProgramme.getClassification_short());
|
||||||
// setProgramme(h2020classification, ocsvProgramme.get().getClassification());
|
// setProgramme(h2020classification, ocsvProgramme.get().getClassification());
|
||||||
pp.setH2020classification(Arrays.asList(h2020classification));
|
pp.setH2020classification(Arrays.asList(h2020classification));
|
||||||
|
|
||||||
|
@ -145,10 +144,11 @@ public class SparkAtomicActionJob {
|
||||||
})
|
})
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
|
|
||||||
}, Encoders.bean(Project.class));
|
}, Encoders.bean(Project.class))
|
||||||
|
.filter(Objects::nonNull);
|
||||||
|
|
||||||
aaproject
|
aaproject
|
||||||
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")))
|
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left")
|
||||||
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> {
|
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> {
|
||||||
Optional<EXCELTopic> op = Optional.ofNullable(p._2());
|
Optional<EXCELTopic> op = Optional.ofNullable(p._2());
|
||||||
Project rp = p._1();
|
Project rp = p._1();
|
||||||
|
|
|
@ -7,14 +7,7 @@ import java.io.Serializable;
|
||||||
* The model for the programme csv file
|
* The model for the programme csv file
|
||||||
*/
|
*/
|
||||||
public class CSVProgramme implements Serializable {
|
public class CSVProgramme implements Serializable {
|
||||||
private String parentProgramme;
|
|
||||||
private String frameworkProgramme;
|
|
||||||
private String startDate;
|
|
||||||
private String endDate;
|
|
||||||
private String objective;
|
|
||||||
private String subjects;
|
|
||||||
private String legalBasis;
|
|
||||||
private String call;
|
|
||||||
private String rcn;
|
private String rcn;
|
||||||
private String code;
|
private String code;
|
||||||
|
|
||||||
|
@ -80,67 +73,5 @@ public class CSVProgramme implements Serializable {
|
||||||
this.language = language;
|
this.language = language;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getParentProgramme() {
|
//
|
||||||
return parentProgramme;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setParentProgramme(String parentProgramme) {
|
|
||||||
this.parentProgramme = parentProgramme;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getFrameworkProgramme() {
|
|
||||||
return frameworkProgramme;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setFrameworkProgramme(String frameworkProgramme) {
|
|
||||||
this.frameworkProgramme = frameworkProgramme;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getStartDate() {
|
|
||||||
return startDate;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setStartDate(String startDate) {
|
|
||||||
this.startDate = startDate;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getEndDate() {
|
|
||||||
return endDate;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setEndDate(String endDate) {
|
|
||||||
this.endDate = endDate;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getObjective() {
|
|
||||||
return objective;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setObjective(String objective) {
|
|
||||||
this.objective = objective;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getSubjects() {
|
|
||||||
return subjects;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setSubjects(String subjects) {
|
|
||||||
this.subjects = subjects;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getLegalBasis() {
|
|
||||||
return legalBasis;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setLegalBasis(String legalBasis) {
|
|
||||||
this.legalBasis = legalBasis;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getCall() {
|
|
||||||
return call;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setCall(String call) {
|
|
||||||
this.call = call;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,7 +26,6 @@ public class EXCELParser {
|
||||||
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
|
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
|
||||||
InvalidFormatException {
|
InvalidFormatException {
|
||||||
|
|
||||||
// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL));
|
|
||||||
OPCPackage pkg = OPCPackage.open(file);
|
OPCPackage pkg = OPCPackage.open(file);
|
||||||
XSSFWorkbook wb = new XSSFWorkbook(pkg);
|
XSSFWorkbook wb = new XSSFWorkbook(pkg);
|
||||||
|
|
||||||
|
@ -58,7 +57,6 @@ public class EXCELParser {
|
||||||
|
|
||||||
for (int i = 0; i < headers.size(); i++) {
|
for (int i = 0; i < headers.size(); i++) {
|
||||||
Cell cell = row.getCell(i);
|
Cell cell = row.getCell(i);
|
||||||
String value = dataFormatter.formatCellValue(cell);
|
|
||||||
FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true);
|
FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,12 @@
|
||||||
"paramDescription": "avoid to downlaod new items but apply the previous update",
|
"paramDescription": "avoid to downlaod new items but apply the previous update",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"paramName": "bs",
|
||||||
|
"paramLongName": "blocksize",
|
||||||
|
"paramDescription": "define the requests block size",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "n",
|
"paramName": "n",
|
||||||
"paramLongName": "namenode",
|
"paramLongName": "namenode",
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
<workflow-app name="H2020Programme" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="H2020Classification" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>projectFileURL</name>
|
<name>projectFileURL</name>
|
||||||
|
@ -18,6 +18,10 @@
|
||||||
<name>outputPath</name>
|
<name>outputPath</name>
|
||||||
<description>path where to store the action set</description>
|
<description>path where to store the action set</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sheetName</name>
|
||||||
|
<description>the name of the sheet to read</description>
|
||||||
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="deleteoutputpath"/>
|
<start to="deleteoutputpath"/>
|
||||||
|
@ -31,10 +35,23 @@
|
||||||
<delete path='${workingDir}'/>
|
<delete path='${workingDir}'/>
|
||||||
<mkdir path='${workingDir}'/>
|
<mkdir path='${workingDir}'/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="get_project_file"/>
|
<ok to="fork_get_info"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<fork name="fork_get_info">
|
||||||
|
<path start="fork_get_projects"/>
|
||||||
|
<path start="get_programme_file"/>
|
||||||
|
<path start="get_topic_file"/>
|
||||||
|
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<fork name="fork_get_projects">
|
||||||
|
<path start="get_project_file"/>
|
||||||
|
<path start="read_projects"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
<action name="get_project_file">
|
<action name="get_project_file">
|
||||||
<java>
|
<java>
|
||||||
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
|
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
|
||||||
|
@ -43,7 +60,7 @@
|
||||||
<arg>--hdfsPath</arg><arg>${workingDir}/projects</arg>
|
<arg>--hdfsPath</arg><arg>${workingDir}/projects</arg>
|
||||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProject</arg>
|
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProject</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="get_programme_file"/>
|
<ok to="wait_projects"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -55,7 +72,7 @@
|
||||||
<arg>--hdfsPath</arg><arg>${workingDir}/programme</arg>
|
<arg>--hdfsPath</arg><arg>${workingDir}/programme</arg>
|
||||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme</arg>
|
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="get_topic_file"/>
|
<ok to="prepare_programme"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -68,7 +85,7 @@
|
||||||
<arg>--sheetName</arg><arg>${sheetName}</arg>
|
<arg>--sheetName</arg><arg>${sheetName}</arg>
|
||||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic</arg>
|
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="read_projects"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -81,7 +98,7 @@
|
||||||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
||||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="prepare_programme"/>
|
<ok to="wait_projects"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -105,10 +122,15 @@
|
||||||
<arg>--programmePath</arg><arg>${workingDir}/programme</arg>
|
<arg>--programmePath</arg><arg>${workingDir}/programme</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/preparedProgramme</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedProgramme</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="prepare_project"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<join name="wait" to="create_updates"/>
|
||||||
|
|
||||||
|
<join name="wait_projects" to="prepare_project"/>
|
||||||
|
|
||||||
|
|
||||||
<action name="prepare_project">
|
<action name="prepare_project">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
|
@ -130,7 +152,7 @@
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/preparedProjects</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedProjects</arg>
|
||||||
<arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
|
<arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="create_updates"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
|
|
@ -20,8 +20,8 @@ import eu.dnetlib.dhp.collection.HttpConnector2;
|
||||||
public class EXCELParserTest {
|
public class EXCELParserTest {
|
||||||
|
|
||||||
private static Path workingDir;
|
private static Path workingDir;
|
||||||
private final HttpConnector2 httpConnector = new HttpConnector2();
|
private HttpConnector2 httpConnector = new HttpConnector2();
|
||||||
private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
|
private static final String URL = "https://cordis.europa.eu/data/reference/cordisref-h2020topics.xlsx";
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void beforeAll() throws IOException {
|
public static void beforeAll() throws IOException {
|
||||||
|
@ -35,11 +35,12 @@ public class EXCELParserTest {
|
||||||
|
|
||||||
EXCELParser excelParser = new EXCELParser();
|
EXCELParser excelParser = new EXCELParser();
|
||||||
|
|
||||||
final String classForName = "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic";
|
List<Object> pl = excelParser
|
||||||
final String sheetName = "Topics";
|
.parse(
|
||||||
List<Object> pl = excelParser.parse(httpConnector.getInputSourceAsStream(URL), classForName, sheetName);
|
httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic",
|
||||||
|
"Topics");
|
||||||
|
|
||||||
Assertions.assertEquals(3837, pl.size());
|
Assertions.assertEquals(3878, pl.size());
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -26,6 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
|
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
|
@ -144,7 +145,7 @@ public class ConversionUtils {
|
||||||
.filter(pid -> pid != null)
|
.filter(pid -> pid != null)
|
||||||
.filter(pid -> pid.getQualifier() != null)
|
.filter(pid -> pid.getQualifier() != null)
|
||||||
.filter(pid -> pid.getQualifier().getClassid() != null)
|
.filter(pid -> pid.getQualifier().getClassid() != null)
|
||||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
|
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
|
||||||
.map(pid -> pid.getValue())
|
.map(pid -> pid.getValue())
|
||||||
.map(pid -> cleanOrcid(pid))
|
.map(pid -> cleanOrcid(pid))
|
||||||
.filter(StringUtils::isNotBlank)
|
.filter(StringUtils::isNotBlank)
|
||||||
|
|
|
@ -93,7 +93,7 @@ public class PublicationToOaf implements Serializable {
|
||||||
{
|
{
|
||||||
put(
|
put(
|
||||||
ModelConstants.ORCID,
|
ModelConstants.ORCID,
|
||||||
new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + ModelConstants.ORCID));
|
||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -126,8 +126,6 @@ public class PublicationToOaf implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static final String PID_TYPES = "dnet:pid_types";
|
|
||||||
|
|
||||||
public Oaf generatePublicationActionsFromJson(final String json) {
|
public Oaf generatePublicationActionsFromJson(final String json) {
|
||||||
if (parsedPublications != null) {
|
if (parsedPublications != null) {
|
||||||
parsedPublications.add(1);
|
parsedPublications.add(1);
|
||||||
|
|
|
@ -24,8 +24,6 @@ public class Constants {
|
||||||
|
|
||||||
public static String RESEARCH_INFRASTRUCTURE = "Research Infrastructure/Initiative";
|
public static String RESEARCH_INFRASTRUCTURE = "Research Infrastructure/Initiative";
|
||||||
|
|
||||||
public static String ORCID = "orcid";
|
|
||||||
|
|
||||||
static {
|
static {
|
||||||
accessRightsCoarMap.put("OPEN", "c_abf2");
|
accessRightsCoarMap.put("OPEN", "c_abf2");
|
||||||
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
|
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
|
||||||
|
|
|
@ -503,7 +503,7 @@ public class ResultMapper implements Serializable {
|
||||||
|
|
||||||
private static Pid getOrcid(List<StructuredProperty> p) {
|
private static Pid getOrcid(List<StructuredProperty> p) {
|
||||||
for (StructuredProperty pid : p) {
|
for (StructuredProperty pid : p) {
|
||||||
if (pid.getQualifier().getClassid().equals(Constants.ORCID)) {
|
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||||
if (di.isPresent()) {
|
if (di.isPresent()) {
|
||||||
return Pid
|
return Pid
|
||||||
|
|
|
@ -76,7 +76,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||||
protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/";
|
protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/";
|
||||||
protected static final Qualifier ORCID_PID_TYPE = qualifier(
|
protected static final Qualifier ORCID_PID_TYPE = qualifier(
|
||||||
"ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES);
|
ORCID_PENDING, ORCID_CLASSNAME, DNET_PID_TYPES, DNET_PID_TYPES);
|
||||||
protected static final Qualifier MAG_PID_TYPE = qualifier(
|
protected static final Qualifier MAG_PID_TYPE = qualifier(
|
||||||
"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);
|
"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@ import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
@ -56,7 +57,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
author.setPid(new ArrayList<>());
|
author.setPid(new ArrayList<>());
|
||||||
|
|
||||||
if (StringUtils.isNotBlank(pid)) {
|
if (StringUtils.isNotBlank(pid)) {
|
||||||
if (type.startsWith("ORCID")) {
|
if (type.toLowerCase().startsWith(ORCID)) {
|
||||||
final String cleanedId = pid
|
final String cleanedId = pid
|
||||||
.replaceAll("http://orcid.org/", "")
|
.replaceAll("http://orcid.org/", "")
|
||||||
.replaceAll("https://orcid.org/", "");
|
.replaceAll("https://orcid.org/", "");
|
||||||
|
|
|
@ -13,6 +13,7 @@ import org.dom4j.Node;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
@ -85,7 +86,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
.replaceAll(" ", "")
|
.replaceAll(" ", "")
|
||||||
.replaceAll("_", "");
|
.replaceAll("_", "");
|
||||||
|
|
||||||
if (type.startsWith("ORCID")) {
|
if (type.toLowerCase().startsWith(ORCID)) {
|
||||||
final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", "");
|
final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", "");
|
||||||
res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
|
res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
|
||||||
} else if (type.startsWith("MAGID")) {
|
} else if (type.startsWith("MAGID")) {
|
||||||
|
|
|
@ -99,8 +99,8 @@ public class MappersTest {
|
||||||
.findFirst()
|
.findFirst()
|
||||||
.get();
|
.get();
|
||||||
assertEquals("0000-0001-6651-1178", pid.getValue());
|
assertEquals("0000-0001-6651-1178", pid.getValue());
|
||||||
assertEquals("ORCID", pid.getQualifier().getClassid());
|
assertEquals(ModelConstants.ORCID_PENDING, pid.getQualifier().getClassid());
|
||||||
assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname());
|
assertEquals(ModelConstants.ORCID_CLASSNAME, pid.getQualifier().getClassname());
|
||||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
||||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
||||||
assertEquals("Votsi,Nefta", author.get().getFullname());
|
assertEquals("Votsi,Nefta", author.get().getFullname());
|
||||||
|
@ -280,8 +280,8 @@ public class MappersTest {
|
||||||
.findFirst()
|
.findFirst()
|
||||||
.get();
|
.get();
|
||||||
assertEquals("0000-0001-9074-1619", pid.getValue());
|
assertEquals("0000-0001-9074-1619", pid.getValue());
|
||||||
assertEquals("ORCID", pid.getQualifier().getClassid());
|
assertEquals(ModelConstants.ORCID_PENDING, pid.getQualifier().getClassid());
|
||||||
assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname());
|
assertEquals(ModelConstants.ORCID_CLASSNAME, pid.getQualifier().getClassname());
|
||||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
||||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
||||||
assertEquals("Baracchini, Theo", author.get().getFullname());
|
assertEquals("Baracchini, Theo", author.get().getFullname());
|
||||||
|
|
|
@ -1160,6 +1160,27 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.asXmlElement(
|
.asXmlElement(
|
||||||
"distributionlocation", instance.getDistributionlocation()));
|
"distributionlocation", instance.getDistributionlocation()));
|
||||||
}
|
}
|
||||||
|
if (instance.getPid() != null) {
|
||||||
|
fields
|
||||||
|
.addAll(
|
||||||
|
instance
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
if (instance.getAlternateIdentifier() != null) {
|
||||||
|
fields
|
||||||
|
.addAll(
|
||||||
|
instance
|
||||||
|
.getAlternateIdentifier()
|
||||||
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.map(p -> XmlSerializationUtils.mapStructuredProperty("alternateidentifier", p))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
if (instance.getRefereed() != null && !instance.getRefereed().isBlank()) {
|
if (instance.getRefereed() != null && !instance.getRefereed().isBlank()) {
|
||||||
fields
|
fields
|
||||||
.add(
|
.add(
|
||||||
|
|
|
@ -61,6 +61,11 @@ public class XmlRecordFactoryTest {
|
||||||
Assertions.assertEquals("0000-0001-9613-9956", doc.valueOf("//creator[@rank = '2']/@orcid"));
|
Assertions.assertEquals("0000-0001-9613-9956", doc.valueOf("//creator[@rank = '2']/@orcid"));
|
||||||
Assertions.assertEquals("", doc.valueOf("//creator[@rank = '2']/@orcid_pending"));
|
Assertions.assertEquals("", doc.valueOf("//creator[@rank = '2']/@orcid_pending"));
|
||||||
|
|
||||||
|
Assertions.assertEquals("doi", doc.valueOf("//instance/pid/@classid"));
|
||||||
|
Assertions.assertEquals("10.1109/TED.2018.2853550", doc.valueOf("//instance/pid/text()"));
|
||||||
|
|
||||||
|
Assertions.assertEquals("doi", doc.valueOf("//instance/alternateidentifier/@classid"));
|
||||||
|
Assertions.assertEquals("10.5689/LIB.2018.2853550", doc.valueOf("//instance/alternateidentifier/text()"));
|
||||||
// TODO add assertions based on values extracted from the XML record
|
// TODO add assertions based on values extracted from the XML record
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -284,6 +284,54 @@
|
||||||
"id": "50|CSC_________::0000ec4dd9df012feaafa77e71a0fb4c",
|
"id": "50|CSC_________::0000ec4dd9df012feaafa77e71a0fb4c",
|
||||||
"instance": [
|
"instance": [
|
||||||
{
|
{
|
||||||
|
"pid": [
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "",
|
||||||
|
"classname": "",
|
||||||
|
"schemeid": "",
|
||||||
|
"schemename": ""
|
||||||
|
},
|
||||||
|
"trust": ""
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.1109/TED.2018.2853550"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"alternateIdentifier": [
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "",
|
||||||
|
"classname": "",
|
||||||
|
"schemeid": "",
|
||||||
|
"schemename": ""
|
||||||
|
},
|
||||||
|
"trust": ""
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.5689/LIB.2018.2853550"
|
||||||
|
}
|
||||||
|
],
|
||||||
"accessright": {
|
"accessright": {
|
||||||
"classid": "OPEN",
|
"classid": "OPEN",
|
||||||
"classname": "Open Access",
|
"classname": "Open Access",
|
||||||
|
|
|
@ -16,6 +16,14 @@
|
||||||
<name>monitor_db_production_name</name>
|
<name>monitor_db_production_name</name>
|
||||||
<description>the name of the monitor public database</description>
|
<description>the name of the monitor public database</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>observatory_db_name</name>
|
||||||
|
<description>the observatory database name</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>observatory_db_production_name</name>
|
||||||
|
<description>the name of the observatory public database</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>stats_tool_api_url</name>
|
<name>stats_tool_api_url</name>
|
||||||
<description>The url of the API of the stats tool. Is used to trigger the cache promote.</description>
|
<description>The url of the API of the stats tool. Is used to trigger the cache promote.</description>
|
||||||
|
@ -77,6 +85,19 @@
|
||||||
<argument>${monitor_db_production_name}</argument>
|
<argument>${monitor_db_production_name}</argument>
|
||||||
<file>updateProductionViews.sh</file>
|
<file>updateProductionViews.sh</file>
|
||||||
</shell>
|
</shell>
|
||||||
|
<ok to="updateObservatoryViews"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="updateObservatoryViews">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>updateProductionViews.sh</exec>
|
||||||
|
<argument>${observatory_db_name}</argument>
|
||||||
|
<argument>${observatory_db_production_name}</argument>
|
||||||
|
<file>updateProductionViews.sh</file>
|
||||||
|
</shell>
|
||||||
<ok to="promoteCache"/>
|
<ok to="promoteCache"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
|
@ -0,0 +1,28 @@
|
||||||
|
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
|
||||||
|
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
|
||||||
|
if ! [ -L $link_folder ]
|
||||||
|
then
|
||||||
|
rm -Rf "$link_folder"
|
||||||
|
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||||
|
fi
|
||||||
|
|
||||||
|
export SOURCE=$1
|
||||||
|
export TARGET=$2
|
||||||
|
export SHADOW=$3
|
||||||
|
export SCRIPT_PATH=$4
|
||||||
|
|
||||||
|
echo "Getting file from " $4
|
||||||
|
hdfs dfs -copyToLocal $4
|
||||||
|
|
||||||
|
echo "Creating observatory database"
|
||||||
|
impala-shell -q "drop database if exists ${TARGET} cascade"
|
||||||
|
impala-shell -q "create database if not exists ${TARGET}"
|
||||||
|
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
|
||||||
|
cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f -
|
||||||
|
echo "Impala shell finished"
|
||||||
|
|
||||||
|
echo "Updating shadow observatory database"
|
||||||
|
impala-shell -q "create database if not exists ${SHADOW}"
|
||||||
|
impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
|
||||||
|
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f -
|
||||||
|
echo "Shadow db ready!"
|
|
@ -45,35 +45,3 @@ FROM ${stats_db_name}.dataset
|
||||||
UNION ALL
|
UNION ALL
|
||||||
SELECT *, bestlicence AS access_mode
|
SELECT *, bestlicence AS access_mode
|
||||||
FROM ${stats_db_name}.otherresearchproduct;
|
FROM ${stats_db_name}.otherresearchproduct;
|
||||||
|
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
-- To see with Antonis if the following is needed and where it should be placed
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
CREATE TABLE ${stats_db_name}.numbers_country AS
|
|
||||||
SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications
|
|
||||||
FROM ${stats_db_name}.result r,
|
|
||||||
${stats_db_name}.result_datasources rd,
|
|
||||||
${stats_db_name}.datasource d,
|
|
||||||
${stats_db_name}.datasource_organizations dor,
|
|
||||||
${stats_db_name}.organization org
|
|
||||||
WHERE r.id = rd.id
|
|
||||||
AND rd.datasource = d.id
|
|
||||||
AND d.id = dor.id
|
|
||||||
AND dor.organization = org.id
|
|
||||||
AND r.type = 'publication'
|
|
||||||
AND r.bestlicence = 'Open Access'
|
|
||||||
GROUP BY org.country;
|
|
||||||
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS FOR COLUMNS;
|
|
|
@ -60,32 +60,3 @@ union all
|
||||||
select distinct r.id, false as gold
|
select distinct r.id, false as gold
|
||||||
from ${stats_db_name}.result r
|
from ${stats_db_name}.result r
|
||||||
where r.id not in (select id from result_gold);
|
where r.id not in (select id from result_gold);
|
||||||
|
|
||||||
-- shortcut result-country through the organization affiliation
|
|
||||||
create table ${stats_db_name}.result_affiliated_country as
|
|
||||||
select r.id as id, o.country as country
|
|
||||||
from ${stats_db_name}.result r
|
|
||||||
join ${stats_db_name}.result_organization ro on ro.id=r.id
|
|
||||||
join ${stats_db_name}.organization o on o.id=ro.organization
|
|
||||||
where o.country is not null and o.country!='';
|
|
||||||
|
|
||||||
-- shortcut result-country through datasource of deposition
|
|
||||||
create table ${stats_db_name}.result_deposited_country as
|
|
||||||
select r.id as id, o.country as country
|
|
||||||
from ${stats_db_name}.result r
|
|
||||||
join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
|
||||||
join ${stats_db_name}.datasource d on d.id=rd.datasource
|
|
||||||
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
|
||||||
join ${stats_db_name}.organization o on o.id=dor.organization
|
|
||||||
where o.country is not null and o.country!='';
|
|
||||||
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS FOR COLUMNS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS FOR COLUMNS;
|
|
|
@ -53,6 +53,3 @@ drop table if exists ${stats_db_name}.result;
|
||||||
drop view if exists ${stats_db_name}.result;
|
drop view if exists ${stats_db_name}.result;
|
||||||
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
|
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
|
||||||
drop table ${stats_db_name}.result_tmp;
|
drop table ${stats_db_name}.result_tmp;
|
||||||
--
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS;
|
|
||||||
-- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS FOR COLUMNS;
|
|
|
@ -19,9 +19,6 @@ create table TARGET.result as
|
||||||
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) ) foo;
|
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) ) foo;
|
||||||
compute stats TARGET.result;
|
compute stats TARGET.result;
|
||||||
|
|
||||||
create table TARGET.result_affiliated_country as select * from SOURCE.result_affiliated_country rac where exists (select 1 from TARGET.result r where r.id=rac.id);
|
|
||||||
compute stats TARGET.result_affiliated_country;
|
|
||||||
|
|
||||||
create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_citations;
|
compute stats TARGET.result_citations;
|
||||||
|
|
||||||
|
@ -34,9 +31,6 @@ compute stats TARGET.result_concepts;
|
||||||
create table TARGET.result_datasources as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_datasources as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_datasources;
|
compute stats TARGET.result_datasources;
|
||||||
|
|
||||||
create table TARGET.result_deposited_country as select * from SOURCE.result_deposited_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
|
||||||
compute stats TARGET.result_deposited_country;
|
|
||||||
|
|
||||||
create table TARGET.result_fundercount as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_fundercount as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_fundercount;
|
compute stats TARGET.result_fundercount;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,259 @@
|
||||||
|
create table TARGET.result_affiliated_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_organization ro on ro.id=r.id
|
||||||
|
join SOURCE.organization o on o.id=ro.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name;
|
||||||
|
|
||||||
|
create table TARGET.result_affiliated_year stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_organization ro on ro.id=r.id
|
||||||
|
join SOURCE.organization o on o.id=ro.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year;
|
||||||
|
|
||||||
|
create table TARGET.result_affiliated_year_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_organization ro on ro.id=r.id
|
||||||
|
join SOURCE.organization o on o.id=ro.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name;
|
||||||
|
|
||||||
|
create table TARGET.result_affiliated_datasource stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, d.name as dname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_organization ro on ro.id=r.id
|
||||||
|
join SOURCE.organization o on o.id=ro.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
left outer join SOURCE.datasource d on d.id=rd.datasource
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name;
|
||||||
|
|
||||||
|
create table TARGET.result_affiliated_datasource_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_organization ro on ro.id=r.id
|
||||||
|
join SOURCE.organization o on o.id=ro.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
left outer join SOURCE.datasource d on d.id=rd.datasource
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name;
|
||||||
|
|
||||||
|
create table TARGET.result_affiliated_organization stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, o.name as oname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_organization ro on ro.id=r.id
|
||||||
|
join SOURCE.organization o on o.id=ro.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name;
|
||||||
|
|
||||||
|
create table TARGET.result_affiliated_organization_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_organization ro on ro.id=r.id
|
||||||
|
join SOURCE.organization o on o.id=ro.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name;
|
||||||
|
|
||||||
|
create table TARGET.result_affiliated_funder stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, p.funder as pfunder
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_organization ro on ro.id=r.id
|
||||||
|
join SOURCE.organization o on o.id=ro.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
join SOURCE.result_projects rp on rp.id=r.id
|
||||||
|
join SOURCE.project p on p.id=rp.project
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder;
|
||||||
|
|
||||||
|
create table TARGET.result_affiliated_funder_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_organization ro on ro.id=r.id
|
||||||
|
join SOURCE.organization o on o.id=ro.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
join SOURCE.result_projects rp on rp.id=r.id
|
||||||
|
join SOURCE.project p on p.id=rp.project
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name;
|
||||||
|
|
||||||
|
create table TARGET.result_deposited_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||||
|
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||||
|
join SOURCE.organization o on o.id=dor.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name;
|
||||||
|
|
||||||
|
create table TARGET.result_deposited_year stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||||
|
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||||
|
join SOURCE.organization o on o.id=dor.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year;
|
||||||
|
|
||||||
|
create table TARGET.result_deposited_year_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||||
|
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||||
|
join SOURCE.organization o on o.id=dor.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name;
|
||||||
|
|
||||||
|
create table TARGET.result_deposited_datasource stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, d.name as dname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||||
|
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||||
|
join SOURCE.organization o on o.id=dor.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name;
|
||||||
|
|
||||||
|
create table TARGET.result_deposited_datasource_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||||
|
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||||
|
join SOURCE.organization o on o.id=dor.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name;
|
||||||
|
|
||||||
|
create table TARGET.result_deposited_organization stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||||
|
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||||
|
join SOURCE.organization o on o.id=dor.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name;
|
||||||
|
|
||||||
|
create table TARGET.result_deposited_organization_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||||
|
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||||
|
join SOURCE.organization o on o.id=dor.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name;
|
||||||
|
|
||||||
|
create table TARGET.result_deposited_funder stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, p.funder as pfunder
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||||
|
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||||
|
join SOURCE.organization o on o.id=dor.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
join SOURCE.result_projects rp on rp.id=r.id
|
||||||
|
join SOURCE.project p on p.id=rp.project
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder;
|
||||||
|
|
||||||
|
create table TARGET.result_deposited_funder_country stored as parquet as
|
||||||
|
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||||
|
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||||
|
r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname
|
||||||
|
from SOURCE.result r
|
||||||
|
join SOURCE.result_datasources rd on rd.id=r.id
|
||||||
|
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||||
|
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||||
|
join SOURCE.organization o on o.id=dor.organization
|
||||||
|
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||||
|
join SOURCE.result_projects rp on rp.id=r.id
|
||||||
|
join SOURCE.project p on p.id=rp.project
|
||||||
|
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||||
|
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||||
|
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name;
|
||||||
|
|
||||||
|
compute stats TARGET.result_affiliated_country;
|
||||||
|
compute stats TARGET.result_affiliated_year;
|
||||||
|
compute stats TARGET.result_affiliated_year_country;
|
||||||
|
compute stats TARGET.result_affiliated_datasource;
|
||||||
|
compute stats TARGET.result_affiliated_datasource_country;
|
||||||
|
compute stats TARGET.result_affiliated_organization;
|
||||||
|
compute stats TARGET.result_affiliated_organization_country;
|
||||||
|
compute stats TARGET.result_affiliated_funder;
|
||||||
|
compute stats TARGET.result_affiliated_funder_country;
|
||||||
|
compute stats TARGET.result_deposited_country;
|
||||||
|
compute stats TARGET.result_deposited_year;
|
||||||
|
compute stats TARGET.result_deposited_year_country;
|
||||||
|
compute stats TARGET.result_deposited_datasource;
|
||||||
|
compute stats TARGET.result_deposited_datasource_country;
|
||||||
|
compute stats TARGET.result_deposited_organization;
|
||||||
|
compute stats TARGET.result_deposited_organization_country;
|
||||||
|
compute stats TARGET.result_deposited_funder;
|
||||||
|
compute stats TARGET.result_deposited_funder_country;
|
|
@ -25,6 +25,14 @@
|
||||||
<name>monitor_db_shadow_name</name>
|
<name>monitor_db_shadow_name</name>
|
||||||
<description>the name of the shadow monitor db</description>
|
<description>the name of the shadow monitor db</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>observatory_db_name</name>
|
||||||
|
<description>the target observatory db name</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>observatory_db_shadow_name</name>
|
||||||
|
<description>the name of the shadow observatory db</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>stats_tool_api_url</name>
|
<name>stats_tool_api_url</name>
|
||||||
<description>The url of the API of the stats tool. Is used to trigger the cache update.</description>
|
<description>The url of the API of the stats tool. Is used to trigger the cache update.</description>
|
||||||
|
@ -305,11 +313,26 @@
|
||||||
<argument>${wf:appPath()}/scripts/step20-createMonitorDB.sql</argument>
|
<argument>${wf:appPath()}/scripts/step20-createMonitorDB.sql</argument>
|
||||||
<file>monitor.sh</file>
|
<file>monitor.sh</file>
|
||||||
</shell>
|
</shell>
|
||||||
<ok to="Step21"/>
|
<ok to="step21-createObservatoryDB"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="Step21">
|
<action name="step21-createObservatoryDB">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>observatory.sh</exec>
|
||||||
|
<argument>${stats_db_name}</argument>
|
||||||
|
<argument>${observatory_db_name}</argument>
|
||||||
|
<argument>${observatory_db_shadow_name}</argument>
|
||||||
|
<argument>${wf:appPath()}/scripts/step21-createObservatoryDB.sql</argument>
|
||||||
|
<file>observatory.sh</file>
|
||||||
|
</shell>
|
||||||
|
<ok to="Step22"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="Step22">
|
||||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
<name-node>${nameNode}</name-node>
|
<name-node>${nameNode}</name-node>
|
||||||
|
|
8
pom.xml
8
pom.xml
|
@ -200,6 +200,12 @@
|
||||||
<version>${dhp.commons.lang.version}</version>
|
<version>${dhp.commons.lang.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-validator</groupId>
|
||||||
|
<artifactId>commons-validator</artifactId>
|
||||||
|
<version>1.7</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.guava</groupId>
|
<groupId>com.google.guava</groupId>
|
||||||
<artifactId>guava</artifactId>
|
<artifactId>guava</artifactId>
|
||||||
|
@ -730,7 +736,7 @@
|
||||||
<mockito-core.version>3.3.3</mockito-core.version>
|
<mockito-core.version>3.3.3</mockito-core.version>
|
||||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||||
<vtd.version>[2.12,3.0)</vtd.version>
|
<vtd.version>[2.12,3.0)</vtd.version>
|
||||||
<dhp-schemas.version>[2.4.7]</dhp-schemas.version>
|
<dhp-schemas.version>[2.5.11]</dhp-schemas.version>
|
||||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||||
|
|
Loading…
Reference in New Issue