merge branch with beta

This commit is contained in:
Miriam Baglioni 2021-07-30 11:58:29 +02:00
commit 1d6ac3715b
47 changed files with 963 additions and 383 deletions

View File

@ -25,6 +25,11 @@
<groupId>com.github.sisyphsu</groupId> <groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
</dependency> </dependency>
<dependency>
<groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_2.11</artifactId>

View File

@ -7,22 +7,19 @@ import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException; import java.time.format.DateTimeParseException;
import java.util.*; import java.util.*;
import java.util.function.Function; import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import com.github.sisyphsu.dateparser.DateParserUtils; import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import me.xuender.unidecode.Unidecode;
public class GraphCleaningFunctions extends CleaningFunctions { public class GraphCleaningFunctions extends CleaningFunctions {
@ -194,11 +191,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter( .filter(
sp -> sp sp -> {
final String title = sp
.getValue() .getValue()
.toLowerCase() .toLowerCase();
.replaceAll(TITLE_FILTER_REGEX, "") final String residual = Unidecode
.length() > TITLE_FILTER_RESIDUAL_LENGTH) .decode(title)
.replaceAll(TITLE_FILTER_REGEX, "");
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
})
.map(GraphCleaningFunctions::cleanValue) .map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }

View File

@ -4,12 +4,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException; import java.io.IOException;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -19,13 +15,32 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import me.xuender.unidecode.Unidecode;
public class OafMapperUtilsTest { public class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
/**
 * Checks the ASCII transliteration produced by {@code Unidecode.decode} on a set of
 * sample strings in different scripts, plus one plain ASCII string that must be
 * returned unchanged.
 */
@Test
public void testUnidecode() {
    // Each row is { expected transliteration, original text }.
    final String[][] samples = {
        { "Liu Ben Mu hiruzuSen tawa", "六本木ヒルズ森タワ" },
        { "Nan Wu A Mi Tuo Fo", "南无阿弥陀佛" },
        { "Yi Tiao Hui Zou Lu De Yu", "一条会走路的鱼" },
        { "amidaniyorai", "あみだにょらい" },
        { "T`owrk`iayi", "Թուրքիայի" },
        { "Obzor tematiki", "Обзор тематики" },
        { "GERMANSKIE IaZYKI", "ГЕРМАНСКИЕ ЯЗЫКИ" },
        { "Diereunese tes ikanopoieses", "Διερεύνηση της ικανοποίησης" },
        { "lqDy l'wly@", "القضايا الأولية" },
        { "abc def ghi", "abc def ghi" }
    };

    for (final String[] sample : samples) {
        final String expected = sample[0];
        final String decoded = Unidecode.decode(sample[1]);
        assertEquals(expected, decoded);
    }
}
@Test @Test
public void testDateValidation() { public void testDateValidation() {

View File

@ -64,9 +64,9 @@ abstract class AbstractRestClient extends Iterator[String]{
.setSocketTimeout(timeout * 1000).build() .setSocketTimeout(timeout * 1000).build()
val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build() val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build()
var tries = 4 var tries = 4
try {
while (tries > 0) { while (tries > 0) {
println(s"requesting ${r.getURI}") println(s"requesting ${r.getURI}")
try {
val response = client.execute(r) val response = client.execute(r)
println(s"get response with status${response.getStatusLine.getStatusCode}") println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) { if (response.getStatusLine.getStatusCode > 400) {
@ -74,16 +74,14 @@ abstract class AbstractRestClient extends Iterator[String]{
} }
else else
return IOUtils.toString(response.getEntity.getContent) return IOUtils.toString(response.getEntity.getContent)
}
""
} catch { } catch {
case e: Throwable => case e: Throwable =>
throw new RuntimeException("Error on executing request ", e) println(s"Error on requesting ${r.getURI}")
} finally try client.close() e.printStackTrace()
catch { tries-=1
case e: IOException =>
throw new RuntimeException("Unable to close client ", e)
} }
} }
""
}
getBufferData() getBufferData()
} }

View File

@ -0,0 +1,73 @@
package eu.dnetlib.dhp.actionmanager.scholix
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
/**
 * Spark job preparing the content of the Scholix action set.
 *
 * It reads the relations under `sourcePath/relation`, keeps those that are not deleted by
 * inference and whose relClass does not contain "merge", and stores them under
 * `workingDirFolder/actionSetOaf`. The identifiers used as source or target of the kept
 * relations are stored under `workingDirFolder/id_relation` and then used to select, from
 * `sourcePath/entities`, the results that are appended to `workingDirFolder/actionSetOaf`.
 */
object SparkCreateActionset {

  /**
   * Job entry point.
   *
   * @param args command line arguments, parsed according to
   *             /eu/dnetlib/dhp/sx/actionset/generate_actionset.json
   *             (master, sourcePath, targetPath, workingDirFolder)
   */
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString)
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val sourcePath = parser.get("sourcePath")
    log.info(s"sourcePath  -> $sourcePath")

    val targetPath = parser.get("targetPath")
    log.info(s"targetPath  -> $targetPath")

    val workingDirFolder = parser.get("workingDirFolder")
    log.info(s"workingDirFolder  -> $workingDirFolder")

    implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result]
    implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation]

    import spark.implicits._

    val relation = spark.read.load(s"$sourcePath/relation").as[Relation]

    // Relations to export: not deleted by inference and not produced by a merge.
    // The predicate is defined once and the resulting dataset feeds both outputs below.
    val validRelation = relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))

    log.info("extract source and target Identifier involved in relations")
    validRelation
      .flatMap(r => List(r.getSource, r.getTarget))
      .distinct()
      .write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")

    // Read the identifiers back from storage so the join below works on the materialized set.
    val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]

    log.info("save relation filtered")
    validRelation
      .write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")

    log.info("saving entities")
    val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))

    // Keep only the entities referenced by at least one exported relation and append
    // them to the relations already stored in actionSetOaf.
    entities
      .joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
      .map(p => p._1._2)
      .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
  }

}

View File

@ -1,9 +1,9 @@
package eu.dnetlib.dhp.sx.provision package eu.dnetlib.dhp.actionmanager.scholix
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Software, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation}
import org.apache.hadoop.io.Text import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat import org.apache.hadoop.mapred.SequenceFileOutputFormat

View File

@ -14,7 +14,7 @@
</property> </property>
</parameters> </parameters>
<start to="ExportDataset"/> <start to="createActionSet"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@ -25,7 +25,7 @@
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Create Action Set</name> <name>Create Action Set</name>
<class>eu.dnetlib.dhp.sx.provision.SparkCreateActionset</class> <class>eu.dnetlib.dhp.actionmanager.scholix.SparkCreateActionset</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar> <jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
@ -42,7 +42,7 @@
<arg>--workingDirFolder</arg><arg>${workingDirFolder}</arg> <arg>--workingDirFolder</arg><arg>${workingDirFolder}</arg>
<arg>--master</arg><arg>yarn-cluster</arg> <arg>--master</arg><arg>yarn-cluster</arg>
</spark> </spark>
<ok to="End"/> <ok to="SaveActionSet"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -52,7 +52,7 @@
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Save Action Set</name> <name>Save Action Set</name>
<class>eu.dnetlib.dhp.sx.provision.SparkSaveActionSet</class> <class>eu.dnetlib.dhp.actionmanager.scholix.SparkSaveActionSet</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar> <jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDataset { public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDataset {
@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDat
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("isReferencedBy"); return relType.equals(ModelConstants.IS_REFERENCED_BY);
} }
} }

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDataset { public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDataset {
@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDatase
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("isRelatedTo"); return relType.equals(ModelConstants.IS_RELATED_TO);
} }
} }

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingDataset { public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingDataset {
@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingD
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("isSupplementedBy"); return relType.equals(ModelConstants.IS_SUPPLEMENTED_BY);
} }
} }

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingDataset { public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingDataset {
@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingD
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("isSupplementedTo"); return relType.equals(ModelConstants.IS_SUPPLEMENT_TO);
} }
} }

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset { public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset {
@ -11,7 +12,7 @@ public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("references"); return relType.equals(ModelConstants.REFERENCES);
} }
} }

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissingPublication { public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissingPublication {
@ -11,6 +12,6 @@ public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissin
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("isReferencedBy"); return relType.equals(ModelConstants.IS_REFERENCED_BY);
} }
} }

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPublication { public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPublication {
@ -11,7 +12,7 @@ public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPu
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("isRelatedTo"); return relType.equals(ModelConstants.IS_RELATED_TO);
} }
} }

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMissingPublication { public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMissingPublication {
@ -11,6 +12,6 @@ public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMiss
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("isSupplementedBy"); return relType.equals(ModelConstants.IS_SUPPLEMENTED_BY);
} }
} }

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMissingPublication { public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMissingPublication {
@ -11,7 +12,7 @@ public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMiss
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("isSupplementedTo"); return relType.equals(ModelConstants.IS_SUPPLEMENT_TO);
} }
} }

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPublication { public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPublication {
@ -11,7 +12,7 @@ public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPub
@Override @Override
protected boolean filterByType(final String relType) { protected boolean filterByType(final String relType) {
return relType.equals("references"); return relType.equals(ModelConstants.REFERENCES);
} }
} }

View File

@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
public class ClusterUtils { public class ClusterUtils {
@ -52,15 +53,15 @@ public class ClusterUtils {
} }
public static boolean isDedupRoot(final String id) { public static boolean isDedupRoot(final String id) {
return id.contains("dedup_wf_"); return id.contains("dedup");
} }
public static final boolean isValidResultResultClass(final String s) { public static final boolean isValidResultResultClass(final String s) {
return s.equals("isReferencedBy") return s.equals(ModelConstants.IS_REFERENCED_BY)
|| s.equals("isRelatedTo") || s.equals(ModelConstants.IS_RELATED_TO)
|| s.equals("references") || s.equals(ModelConstants.REFERENCES)
|| s.equals("isSupplementedBy") || s.equals(ModelConstants.IS_SUPPLEMENTED_BY)
|| s.equals("isSupplementedTo"); || s.equals(ModelConstants.IS_SUPPLEMENT_TO);
} }
public static <T> T incrementAccumulator(final T o, final LongAccumulator acc) { public static <T> T incrementAccumulator(final T o, final LongAccumulator acc) {

View File

@ -179,20 +179,6 @@ object DoiBoostMappingUtil {
} }
//val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd")
// val pub_date = LocalDate.parse(date, formatter)
// if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){
// val oaq : AccessRight = getOpenAccessQualifier()
// oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
// return oaq
// }
// else{
// return getEmbargoedAccessQualifier()
// }
} }
return getClosedAccessQualifier() return getClosedAccessQualifier()
@ -202,7 +188,8 @@ object DoiBoostMappingUtil {
def getOpenAccessQualifier():AccessRight = { def getOpenAccessQualifier():AccessRight = {
OafMapperUtils.accessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN,"Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
} }
def getRestrictedQualifier():AccessRight = { def getRestrictedQualifier():AccessRight = {
@ -211,7 +198,7 @@ object DoiBoostMappingUtil {
def getUnknownQualifier():AccessRight = { def getUnknownQualifier():AccessRight = {
OafMapperUtils.accessRight("UNKNOWN","not available",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
} }
@ -251,8 +238,7 @@ object DoiBoostMappingUtil {
i.setAccessright(getOpenAccessQualifier()) i.setAccessright(getOpenAccessQualifier())
i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold) i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
} }
// val ar = getOpenAccessQualifier()
// publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
} }
else { else {
hb = ModelConstants.UNKNOWN_REPOSITORY hb = ModelConstants.UNKNOWN_REPOSITORY
@ -261,17 +247,7 @@ object DoiBoostMappingUtil {
}) })
publication.setBestaccessright(OafMapperUtils.createBestAccessRights(publication.getInstance())) publication.setBestaccessright(OafMapperUtils.createBestAccessRights(publication.getInstance()))
// val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid)
// if (ar.nonEmpty) {
// if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){
// val ar = getOpenAccessQualifier()
// publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
// }
// else {
// val ar = getRestrictedQualifier()
// publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
// }
// }
publication publication
} }

View File

@ -0,0 +1,115 @@
package eu.dnetlib.dhp.oa.graph.raw;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.raw.common.RelationIdMapping;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.FileNotFoundException;
import java.util.Objects;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
public class PatchRelationsApplication {
private static final Logger log = LoggerFactory.getLogger(PatchRelationsApplication.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Optional.ofNullable(
PatchRelationsApplication.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json"))
.orElseThrow(FileNotFoundException::new)
));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String graphBasePath = parser.get("graphBasePath");
log.info("graphBasePath: {}", graphBasePath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String idMappingPath = parser.get("idMappingPath");
log.info("idMappingPath: {}", idMappingPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> patchRelations(spark, graphBasePath, workingDir, idMappingPath));
}
/**
* Substitutes the identifiers (source/target) from the set of relations part of the graphBasePath included in the
* mapping provided by the dataset stored on idMappingPath, using workingDir as intermediate storage location.
*
* @param spark the SparkSession
* @param graphBasePath base graph path providing the set of relations to patch
* @param workingDir intermediate storage location
* @param idMappingPath dataset providing the old -> new identifier mapping
*/
private static void patchRelations(final SparkSession spark, final String graphBasePath, final String workingDir, final String idMappingPath) {
final String relationPath = graphBasePath + "/relation";
final Dataset<Relation> rels = Utils.readPath(spark, relationPath, Relation.class);
final Dataset<RelationIdMapping> idMapping = Utils.readPath(spark, idMappingPath, RelationIdMapping.class);
rels
.joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left")
.map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
final Relation r = t._1();
Optional.ofNullable(t._2())
.map(RelationIdMapping::getNewId)
.ifPresent(r::setSource);
return r;
}, Encoders.bean(Relation.class))
.joinWith(idMapping, rels.col("target").equalTo(idMapping.col("oldId")), "left")
.map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
final Relation r = t._1();
Optional.ofNullable(t._2())
.map(RelationIdMapping::getNewId)
.ifPresent(r::setTarget);
return r;
}, Encoders.bean(Relation.class))
.map(
(MapFunction<Relation, String>) OBJECT_MAPPER::writeValueAsString,
Encoders.STRING())
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(workingDir);
spark.read().textFile(workingDir)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(relationPath);
}
}

View File

@ -0,0 +1,24 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
/**
 * Bean describing one entry of an identifier mapping: an old identifier and the
 * new identifier associated with it. Both properties are {@code null} until set.
 */
public class RelationIdMapping {

    /** The identifier to be replaced. */
    private String oldId;

    /** The identifier replacing {@link #oldId}. */
    private String newId;

    /**
     * @return the old identifier, or {@code null} when not set
     */
    public String getOldId() {
        return this.oldId;
    }

    /**
     * @return the new identifier, or {@code null} when not set
     */
    public String getNewId() {
        return this.newId;
    }

    /**
     * @param oldId the old identifier
     */
    public void setOldId(final String oldId) {
        this.oldId = oldId;
    }

    /**
     * @param newId the new identifier
     */
    public void setNewId(final String newId) {
        this.newId = newId;
    }
}

View File

@ -0,0 +1,26 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "g",
"paramLongName": "graphBasePath",
"paramDescription": "base graph path providing the set of relations to patch",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDir",
"paramDescription": "intermediate storage location",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "idMappingPath",
"paramDescription": "dataset providing the old -> new identifier mapping",
"paramRequired": true
}
]

View File

@ -100,6 +100,16 @@
<value></value> <value></value>
<description>a blacklist of nsprefixes (comma separated)</description> <description>a blacklist of nsprefixes (comma separated)</description>
</property> </property>
<property>
<name>shouldPatchRelations</name>
<value>false</value>
<description>activates the relation patching phase, driven by the content in ${idMappingPath}</description>
</property>
<property>
<name>idMappingPath</name>
<value></value>
<description>path pointing to the relations identifiers mapping dataset</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -551,7 +561,6 @@
<path start="merge_claims_relation"/> <path start="merge_claims_relation"/>
</fork> </fork>
<action name="merge_claims_publication"> <action name="merge_claims_publication">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -760,7 +769,42 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait_merge" to="End"/> <join name="wait_merge" to="decisionPatchRelations"/>
<!-- Routes to the patchRelations action only when shouldPatchRelations is "true" and the
     path built from nameNode + '/' + idMappingPath exists; in any other case the workflow
     goes straight to End. -->
<decision name="decisionPatchRelations">
<switch>
<case to="patchRelations">
${(shouldPatchRelations eq "true") and
(fs:exists(concat(concat(wf:conf('nameNode'),'/'),wf:conf('idMappingPath'))) eq "true")}
</case>
<default to="End"/>
</switch>
</decision>
<!-- Spark action running PatchRelationsApplication: it receives the graph location
     (graphOutputPath), a working directory (workingDir/patch_relations) and the identifier
     mapping location (idMappingPath). On success the workflow ends, on error it is killed. -->
<action name="patchRelations">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PatchRelations</name>
<class>eu.dnetlib.dhp.oa.graph.raw.PatchRelationsApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--graphBasePath</arg><arg>${graphOutputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/patch_relations</arg>
<arg>--idMappingPath</arg><arg>${idMappingPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.cleanup;
import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.fixVocabularyNames;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.lenient;
@ -343,7 +344,6 @@ public class MappersTest {
assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:pub.uni-bielefeld.de:2949739"))); assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:pub.uni-bielefeld.de:2949739")));
// assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0)); // assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0));
assertValidId(p.getCollectedfrom().get(0).getKey()); assertValidId(p.getCollectedfrom().get(0).getKey());
assertTrue(p.getAuthor().size() > 0); assertTrue(p.getAuthor().size() > 0);
@ -559,6 +559,31 @@ public class MappersTest {
assertNotNull(d.getInstance().get(0).getUrl()); assertNotNull(d.getInstance().get(0).getUrl());
} }
/**
 * Maps the {@code enermaps.xml} ODF record and verifies that it yields a single
 * {@link Dataset} with valid identifiers, one author, one instance with a URL and
 * the expected context id.
 */
@Test
void testEnermaps() throws IOException {
	final String record = IOUtils.toString(getClass().getResourceAsStream("enermaps.xml"));
	final OdfToOafMapper mapper = new OdfToOafMapper(vocs, false, true);
	final List<Oaf> mapped = mapper.processMdRecord(record);

	// Dump the mapped objects to ease debugging of the fixture.
	final String separator = "***************";
	System.out.println(separator);
	System.out.println(new ObjectMapper().writeValueAsString(mapped));
	System.out.println(separator);

	assertEquals(1, mapped.size());
	final Oaf first = mapped.get(0);
	assertTrue(first instanceof Dataset);

	final Dataset dataset = (Dataset) first;
	assertValidId(dataset.getId());
	assertValidId(dataset.getCollectedfrom().get(0).getKey());
	assertTrue(StringUtils.isNotBlank(dataset.getTitle().get(0).getValue()));
	assertEquals(1, dataset.getAuthor().size());
	assertEquals(1, dataset.getInstance().size());
	assertNotNull(dataset.getInstance().get(0).getUrl());

	// The record carries one oaf:concept, which must surface as the first context.
	assertNotNull(dataset.getContext());
	final String contextId = dataset.getContext().get(0).getId();
	assertTrue(StringUtils.isNotBlank(contextId));
	assertEquals("enermaps::selection::tgs00004", contextId);
}
@Test @Test
void testClaimFromCrossref() throws IOException { void testClaimFromCrossref() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml"));
@ -640,6 +665,30 @@ public class MappersTest {
System.out.println(p.getTitle().get(0).getValue()); System.out.println(p.getTitle().get(0).getValue());
} }
/**
 * Maps the {@code oaf_jairo.xml} record and verifies that its single title is
 * present after mapping and still present once the publication has gone through
 * {@code fixVocabularyNames} and {@code cleanup}.
 */
@Test
void testJairo() throws IOException {
	final String record = IOUtils.toString(getClass().getResourceAsStream("oaf_jairo.xml"));
	final OafToOafMapper mapper = new OafToOafMapper(vocs, false, true);
	final List<Oaf> mapped = mapper.processMdRecord(record);

	// Dump the mapped objects to ease debugging of the fixture.
	final String separator = "***************";
	System.out.println(separator);
	System.out.println(new ObjectMapper().writeValueAsString(mapped));
	System.out.println(separator);

	final Publication publication = (Publication) mapped.get(0);
	assertValidId(publication.getId());
	assertValidId(publication.getCollectedfrom().get(0).getKey());

	assertNotNull(publication.getTitle());
	assertFalse(publication.getTitle().isEmpty());
	assertEquals(1, publication.getTitle().size());
	assertTrue(StringUtils.isNotBlank(publication.getTitle().get(0).getValue()));

	// The title must not be dropped by the cleaning functions.
	final Publication cleaned = cleanup(fixVocabularyNames(publication));
	assertNotNull(cleaned.getTitle());
	assertFalse(cleaned.getTitle().isEmpty());
}
@Test @Test
void testOdfFromHdfs() throws IOException { void testOdfFromHdfs() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml"));

View File

@ -0,0 +1,72 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns="http://datacite.org/schema/kernel-4"
xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:oaf="http://namespace.openaire.eu/oaf">
<oai:header xmlns="http://namespace.openaire.eu/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<dri:objIdentifier>enermaps____::04149ee428d07360314c2cb3ba95d41e</dri:objIdentifier>
<dri:recordIdentifier>tgs00004</dri:recordIdentifier>
<dri:dateOfCollection>2021-07-20T18:43:12.096+02:00</dri:dateOfCollection>
<oaf:datasourceprefix>enermaps____</oaf:datasourceprefix>
</oai:header>
<metadata>
<resource>
<identifier identifierType="URL">https://ec.europa.eu/eurostat/web/products-datasets/-/tgs00004</identifier>
<creators>
<creator>
<creatorName>Statistical Office of the European Union (Eurostat)</creatorName>
</creator>
</creators>
<titles>
<title>
Regional GDP
</title>
</titles>
<publisher>Statistical Office of the European Union (Eurostat)</publisher>
<publicationYear>2020</publicationYear>
<dates>
<date dateType="Issued">2020-10-07</date>
</dates>
<resourceType resourceTypeGeneral="Dataset"/>
<rightsList>
<rights rightsURI="info:eu-repo/semantics/openAccess">OPEN</rights>
<rights rightsURI="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</rights>
</rightsList>
<descriptions>
<description descriptionType="Abstract" xml:lang="EN">GDP expressed in PPS (purchasing power standards) eliminates differences in price levels between countries. Calculations on a per inhabitant basis allow for the comparison of economies and regions significantly different in absolute size. GDP per inhabitant in PPS is the key variable for determining the eligibility of NUTS 2 regions in the framework of the European Unions structural policy.</description>
</descriptions>
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
<oaf:dateAccepted>2020-10-07</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:license>Creative Commons Attribution 4.0 International</oaf:license>
<oaf:hostedBy
id="openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18" name="Unknown Repository"/>
<oaf:collectedFrom id="enermaps____::db" name="Enermaps"/>
<oaf:concept id="enermaps::selection::tgs00004"/>
</resource>
</metadata>
<about xmlns="" xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2021-07-20T18:43:12.096+02:00">
<baseURL>https%3A%2F%2Flab.idiap.ch%2Fenermaps%2Fapi%2Fdatacite</baseURL>
<identifier/>
<datestamp/>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk"
classname="sysimport:crosswalk"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -0,0 +1,70 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header xmlns="http://namespace.openaire.eu/">
<dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
<dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<dr:objectIdentifier/>
<dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
<dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
<oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
</header>
<metadata xmlns="http://namespace.openaire.eu/">
<dc:title>多項式GCDを用いた復号法に関する研究</dc:title>
<dc:creator>上原, 剛</dc:creator>
<dc:creator>甲斐, 博</dc:creator>
<dc:creator>野田, 松太郎</dc:creator>
<dc:format>application/pdf</dc:format>
<dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
<dc:language>jpn</dc:language>
<dc:publisher>京都大学数理解析研究所</dc:publisher>
<dc:subject classid="ndc" classname="ndc"
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
<dc:type>Departmental Bulletin Paper</dc:type>
<dr:CobjCategory type="publication">0014</dr:CobjCategory>
<oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
<oaf:projectid/>
<oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
<oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
<oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
<oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
<oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
<oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
<oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
</metadata>
<about>
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
<baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
<identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
<datestamp>2021-04-13T13:36:29Z</datestamp>
<metadataNamespace/>
<originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
<baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
<identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
<datestamp>2012-07-12T14:15:41Z</datestamp>
<metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
</originDescription>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
classname="sysimport:crosswalk:repository"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -9,6 +9,41 @@
<artifactId>dhp-graph-provision</artifactId> <artifactId>dhp-graph-provision</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<args>
<arg>-Xmax-classfile-name</arg>
<arg>200</arg>
</args>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies> <dependencies>
<dependency> <dependency>

View File

@ -10,6 +10,7 @@ import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
@ -81,6 +82,7 @@ public class PrepareRelationsJob {
Set<String> relationFilter = Optional Set<String> relationFilter = Optional
.ofNullable(parser.get("relationFilter")) .ofNullable(parser.get("relationFilter"))
.map(String::toLowerCase)
.map(s -> Sets.newHashSet(Splitter.on(",").split(s))) .map(s -> Sets.newHashSet(Splitter.on(",").split(s)))
.orElse(new HashSet<>()); .orElse(new HashSet<>());
log.info("relationFilter: {}", relationFilter); log.info("relationFilter: {}", relationFilter);
@ -130,7 +132,7 @@ public class PrepareRelationsJob {
JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath) JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
.filter(rel -> relationFilter.contains(rel.getRelClass()) == false); .filter(rel -> relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())) == false);
JavaRDD<Relation> pruned = pruneRels( JavaRDD<Relation> pruned = pruneRels(
pruneRels( pruneRels(

View File

@ -71,6 +71,9 @@ public class DropAndCreateESIndex {
log.info(STATUS_CODE_TEXT, response.getStatusLine()); log.info(STATUS_CODE_TEXT, response.getStatusLine());
} }
log.info("Sleeping 60 seconds to avoid to lost the creation of index request");
Thread.sleep(60000);
try (CloseableHttpClient client = HttpClients.createDefault()) { try (CloseableHttpClient client = HttpClients.createDefault()) {
final String summaryConf = IOUtils final String summaryConf = IOUtils

View File

@ -1,90 +0,0 @@
package eu.dnetlib.dhp.sx.provision
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import org.apache.spark.{SparkConf, sql}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
/**
  * Spark job that collects, under `workingDirFolder/actionSetOaf`, the relations that
  * survive the filter below together with every publication, dataset, software and
  * other research product whose id appears as source or target of one of those relations.
  */
object SparkCreateActionset {

  /** Keeps relations that are not deleted by inference (or carry no DataInfo) and whose relClass does not contain "merge". */
  private def isExportable(r: Relation): Boolean =
    (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")

  /**
    * Entry point.
    *
    * @param args command line arguments, parsed against generate_actionset.json; the job reads
    *             `master`, `sourcePath`, `targetPath` and `workingDirFolder`
    */
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString)
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val sourcePath = parser.get("sourcePath")
    log.info(s"sourcePath -> $sourcePath")

    // targetPath is parsed and logged only; every output of this job goes under workingDirFolder.
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath -> $targetPath")

    val workingDirFolder = parser.get("workingDirFolder")
    log.info(s"workingDirFolder -> $workingDirFolder")

    implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result]
    implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation]

    import spark.implicits._

    // The filter is defined once and reused for both the id extraction and the relation dump.
    val relation = spark.read.load(s"$sourcePath/relation").as[Relation].filter(r => isExportable(r))

    log.info("extract source and target Identifier involved in relations")
    // Overwrite so that a re-run does not fail on an already existing id_relation folder.
    relation
      .flatMap(r => List(r.getSource, r.getTarget)).distinct()
      .write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")
    val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]

    log.info("save relation filtered")
    relation.write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")

    /** Appends to the action set the results of the given entity folder whose id is referenced by a relation. */
    def appendLinkedResults(entityName: String): Unit = {
      val results: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/$entityName").as[Result].map(p => (p.getId, p))
      results
        // The join key must be taken from the Dataset being joined, not from another one.
        .joinWith(idRelation, results("_1").equalTo(idRelation("value")))
        .map(p => p._1._2)
        .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
    }

    log.info("saving publication")
    appendLinkedResults("publication")

    log.info("saving dataset")
    appendLinkedResults("dataset")

    log.info("saving software")
    appendLinkedResults("software")

    log.info("saving Other Research product")
    appendLinkedResults("otherresearchproduct")
  }
}

View File

@ -21,8 +21,10 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
@ -131,4 +133,32 @@ public class XmlRecordFactoryTest {
System.out.println(doc.asXML()); System.out.println(doc.asXML());
assertEquals("", doc.valueOf("//rel/validated")); assertEquals("", doc.valueOf("//rel/validated"));
} }
/**
 * Builds the XML record for the {@code enermaps.json} dataset using a context map
 * that declares the enermaps community, category and concept, and verifies that
 * the concept id is present in the generated record.
 */
@Test
public void testEnermapsRecord() throws IOException, DocumentException {
	final String contextProfile = String
		.join(
			"",
			"<entries>",
			"<entry id=\"enermaps\" label=\"Energy Research\" name=\"context\" type=\"community\"/>",
			"<entry id=\"enermaps::selection\" label=\"Featured dataset\" name=\"category\"/>",
			"<entry id=\"enermaps::selection::tgs00004\" label=\"Dataset title\" name=\"concept\"/>",
			"</entries>");

	final ContextMapper mapper = ContextMapper.fromXml(contextProfile);
	final XmlRecordFactory factory = new XmlRecordFactory(
		mapper, false, XmlConverterJob.schemaLocation, otherDsTypeId);

	final String json = IOUtils.toString(getClass().getResourceAsStream("enermaps.json"));
	final Dataset dataset = OBJECT_MAPPER.readValue(json, Dataset.class);

	final JoinedEntity joined = new JoinedEntity<>(dataset);
	final String record = factory.build(joined);
	assertNotNull(record);

	final Document parsed = new SAXReader().read(new StringReader(record));
	assertNotNull(parsed);

	System.out.println(parsed.asXML());

	assertEquals("enermaps::selection::tgs00004", parsed.valueOf("//concept/@id"));
}
} }

File diff suppressed because one or more lines are too long

View File

@ -13,7 +13,7 @@ echo "Getting file from " $SCRIPT_PATH
hdfs dfs -copyToLocal $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH
echo "Creating indicators" echo "Creating indicators"
impala-shell -d ${TARGET} -q "invalidate metadata" impala-shell -q "invalidate metadata"
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f -
cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f - cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f -
echo "Indicators created" echo "Indicators created"

View File

@ -57,12 +57,14 @@ UNION ALL
SELECT * FROM ${stats_db_name}.software_sources SELECT * FROM ${stats_db_name}.software_sources
UNION ALL UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
--
-- ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS FOR COLUMNS; create table ${stats_db_name}.result_orcid as
-- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS; select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid
-- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS FOR COLUMNS; from (
-- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS; SELECT substr(res.id, 4) as id, auth_pid.value as orcid
-- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS FOR COLUMNS; FROM ${openaire_db_name}.result res
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS; LATERAL VIEW explode(author) a as auth
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS FOR COLUMNS; LATERAL VIEW explode(auth.pid) ap as auth_pid
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res

View File

@ -34,12 +34,3 @@ union all
select * from ${stats_db_name}.software_refereed select * from ${stats_db_name}.software_refereed
union all union all
select * from ${stats_db_name}.otherresearchproduct_refereed; select * from ${stats_db_name}.otherresearchproduct_refereed;
--
-- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS FOR COLUMNS;

View File

@ -40,3 +40,197 @@ join result_instance ri on ri.id = p.id
join datasource on datasource.id = ri.hostedby join datasource on datasource.id = ri.hostedby
where datasource.id like '%doajarticles%') tmp where datasource.id like '%doajarticles%') tmp
on p.id= tmp.id; on p.id= tmp.id;
-- Per-project result counts. Each table joins project_results with one result
-- type on pr.result and counts the matching rows per project id. Projects with
-- no result of the given type do not appear (inner join).

-- Number of publications per project.
create table indi_project_pubs_count stored as parquet as
select pr.id id, count(p.id) total_pubs from project_results pr
join publication p on p.id=pr.result
group by pr.id;
-- Number of datasets per project.
create table indi_project_datasets_count stored as parquet as
select pr.id id, count(d.id) total_datasets from project_results pr
join dataset d on d.id=pr.result
group by pr.id;
-- Number of software results per project.
create table indi_project_software_count stored as parquet as
select pr.id id, count(s.id) total_software from project_results pr
join software s on s.id=pr.result
group by pr.id;
-- Number of other research products per project.
create table indi_project_otherresearch_count stored as parquet as
select pr.id id, count(o.id) total_other from project_results pr
join otherresearchproduct o on o.id=pr.result
group by pr.id;
-- Percentage of Open Access vs non Open Access publications per year and country,
-- for years 2003..2021. The inner query counts, per (year, country), the rows whose
-- bestlicence is 'Open Access' and those whose bestlicence differs from it; the
-- outer query turns both counts into percentages rounded to 3 decimals.
-- Rows with a NULL bestlicence match neither CASE and are counted in neither bucket.
create table indi_pub_avg_year_country_oa stored as parquet as
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
from
(SELECT year, country, SUM(CASE
WHEN bestlicence='Open Access' THEN 1
ELSE 0
END) AS OpenAccess, SUM(CASE
WHEN bestlicence<>'Open Access' THEN 1
ELSE 0
END) AS NonOpenAccess
FROM publication p
join result_organization ro on p.id=ro.id
join organization o on o.id=ro.organization
where cast(year as int)>=2003 and cast(year as int)<=2021
group by year, country) tmp;
-- Same indicator as above, computed over datasets.
create table indi_dataset_avg_year_country_oa stored as parquet as
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
from
(SELECT year, country, SUM(CASE
WHEN bestlicence='Open Access' THEN 1
ELSE 0
END) AS OpenAccess, SUM(CASE
WHEN bestlicence<>'Open Access' THEN 1
ELSE 0
END) AS NonOpenAccess
FROM dataset d
join result_organization ro on d.id=ro.id
join organization o on o.id=ro.organization
where cast(year as int)>=2003 and cast(year as int)<=2021
group by year, country) tmp;
-- Percentage of Open Access vs non Open Access software per year and country,
-- for years 2003..2021. The inner query counts, per (year, country), the rows whose
-- bestlicence is 'Open Access' and those whose bestlicence differs from it; the
-- outer query turns both counts into percentages rounded to 3 decimals.
-- The organization table is referenced unqualified, like every other table in this
-- script, so that it is resolved in the database the script is executed against.
create table indi_software_avg_year_country_oa stored as parquet as
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
from
(SELECT year, country, SUM(CASE
WHEN bestlicence='Open Access' THEN 1
ELSE 0
END) AS OpenAccess, SUM(CASE
WHEN bestlicence<>'Open Access' THEN 1
ELSE 0
END) AS NonOpenAccess
FROM software s
join result_organization ro on s.id=ro.id
join organization o on o.id=ro.organization
where cast(year as int)>=2003 and cast(year as int)<=2021
group by year, country) tmp;
-- Percentage of Open Access vs non Open Access other research products per year
-- and country, for years 2003..2021. The inner query counts, per (year, country),
-- the rows whose bestlicence is 'Open Access' and those whose bestlicence differs
-- from it; the outer query turns both counts into percentages rounded to 3 decimals.
create table indi_other_avg_year_country_oa stored as parquet as
select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
from
(SELECT year, country, SUM(CASE
WHEN bestlicence='Open Access' THEN 1
ELSE 0
END) AS OpenAccess, SUM(CASE
WHEN bestlicence<>'Open Access' THEN 1
ELSE 0
END) AS NonOpenAccess
FROM otherresearchproduct orp
join result_organization ro on orp.id=ro.id
join organization o on o.id=ro.organization
where cast(year as int)>=2003 and cast(year as int)<=2021
group by year, country) tmp;
-- Yearly share of results per context, for years 2003..2021. In each statement the
-- CTE counts the distinct result ids per (context name, year) and, through the
-- window function, sums those counts over all contexts of the same year; the final
-- select expresses each count as a percentage of that yearly sum (3 decimals).
-- A concept is attributed to a context when the concept string contains the
-- context id (LIKE '%<context id>%').
-- The alias no_of_pubs is reused in all four statements, whatever the result type.

-- Publications.
create table indi_pub_avg_year_context_oa stored as parquet as
with total as
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc
join context c on pc.concept like concat('%',c.id,'%')
join publication p on p.id=pc.id
where cast(year as int)>=2003 and cast(year as int)<=2021
group by c.name, year )
select year, name, round(no_of_pubs/total*100,3) averageofpubs
from total;
-- Datasets.
create table indi_dataset_avg_year_context_oa stored as parquet as
with total as
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc
join context c on pc.concept like concat('%',c.id,'%')
join dataset p on p.id=pc.id
where cast(year as int)>=2003 and cast(year as int)<=2021
group by c.name, year )
select year, name, round(no_of_pubs/total*100,3) averageofdataset
from total;
-- Software.
create table indi_software_avg_year_context_oa stored as parquet as
with total as
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc
join context c on pc.concept like concat('%',c.id,'%')
join software p on p.id=pc.id
where cast(year as int)>=2003 and cast(year as int)<=2021
group by c.name, year )
select year, name, round(no_of_pubs/total*100,3) averageofsoftware
from total;
-- Other research products.
create table indi_other_avg_year_context_oa stored as parquet as
with total as
(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc
join context c on pc.concept like concat('%',c.id,'%')
join otherresearchproduct p on p.id=pc.id
where cast(year as int)>=2003 and cast(year as int)<=2021
group by c.name, year )
select year, name, round(no_of_pubs/total*100,3) averageofother
from total;
-- Yearly share of results per datasource type, for years 2003..2021. In each
-- statement the CTE counts the distinct result ids per (datasource type, year) and,
-- through the window function, sums those counts over all types of the same year;
-- the final select expresses each count as a percentage of that yearly sum
-- (3 decimals).
-- The alias no_of_pubs is reused in all four statements, whatever the result type.

-- Other research products.
create table indi_other_avg_year_content_oa stored as parquet as
with total as
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
from otherresearchproduct_datasources pd
join datasource d on datasource=d.id
join otherresearchproduct p on p.id=pd.id
where cast(year as int)>=2003 and cast(year as int)<=2021
group by d.type, year)
select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct
from total;
-- Software.
create table indi_software_avg_year_content_oa stored as parquet as
with total as
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
from software_datasources pd
join datasource d on datasource=d.id
join software p on p.id=pd.id
where cast(year as int)>=2003 and cast(year as int)<=2021
group by d.type, year)
select year, type, round(no_of_pubs/total*100,3) averageOfSoftware
from total;
-- Datasets.
create table indi_dataset_avg_year_content_oa stored as parquet as
with total as
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
from dataset_datasources pd
join datasource d on datasource=d.id
join dataset p on p.id=pd.id
where cast(year as int)>=2003 and cast(year as int)<=2021
group by d.type, year)
select year, type, round(no_of_pubs/total*100,3) averageOfDatasets
from total;
-- Publications.
create table indi_pub_avg_year_content_oa stored as parquet as
with total as
(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
from publication_datasources pd
join datasource d on datasource=d.id
join publication p on p.id=pd.id
where cast(year as int)>=2003 and cast(year as int)<=2021
group by d.type, year)
select year, type, round(no_of_pubs/total*100,3) averageOfPubs
from total;
-- Flags each publication with 1 when at least one of its licenses looks like a
-- Creative Commons one (license type containing 'creativecommons.org' or 'cc-',
-- case-insensitive), 0 otherwise. The left outer join keeps publications without
-- any matching license, for which lic is NULL.
create table indi_pub_has_cc_licence stored as parquet as
select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from publication p
left outer join (select p.id, license.type as lic from publication p
join publication_licenses as license on license.id = p.id
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
on p.id= tmp.id;
-- Flags each publication with 1 when at least one of its licenses is a URL whose
-- host is exactly 'creativecommons.org' (case-insensitive), 0 otherwise.
create table indi_pub_has_cc_licence_url stored as parquet as
select distinct p.id, (case when lic_host='' or lic_host is null then 0 else 1 end) as has_cc_license_url
from publication p
left outer join (select p.id, lower(parse_url(license.type, "HOST")) as lic_host
from publication p
join publication_licenses as license on license.id = p.id
WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp
on p.id= tmp.id;
-- One row per publication id with the value of its abstract column as has_abstract.
-- NOTE(review): coalesce(abstract, 1) yields 1 when abstract is NULL, i.e. a missing
-- value is reported as "has abstract" - confirm this default is intended and that
-- the literal 1 is type-compatible with the abstract column.
create table indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract
from publication;

View File

@ -90,27 +90,8 @@ FROM ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.publication_citations AS CREATE TABLE ${stats_db_name}.publication_citations AS
SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
lateral view explode(p.extrainfo) citations AS citation lateral view explode(p.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and p.datainfo.deletedbyinference = false; and p.datainfo.deletedbyinference = false;
-- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS FOR COLUMNS;

View File

@ -41,7 +41,7 @@ FROM ${openaire_db_name}.dataset d
WHERE d.datainfo.deletedbyinference = FALSE; WHERE d.datainfo.deletedbyinference = FALSE;
CREATE TABLE ${stats_db_name}.dataset_citations AS CREATE TABLE ${stats_db_name}.dataset_citations AS
SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.dataset d FROM ${openaire_db_name}.dataset d
LATERAL VIEW explode(d.extrainfo) citations AS citation LATERAL VIEW explode(d.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
@ -96,20 +96,3 @@ SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subj
FROM ${openaire_db_name}.dataset p FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;
--
-- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS FOR COLUMNS;

View File

@ -41,7 +41,7 @@ from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference = false; where s.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.software_citations AS CREATE TABLE ${stats_db_name}.software_citations AS
SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.software s FROM ${openaire_db_name}.software s
LATERAL VIEW explode(s.extrainfo) citations as citation LATERAL VIEW explode(s.extrainfo) citations as citation
where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
@ -96,20 +96,3 @@ SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subj
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.subject) subjects AS subject LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;
--
-- ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS FOR COLUMNS;

View File

@ -41,7 +41,7 @@ WHERE o.datainfo.deletedbyinference = FALSE;
-- Otherresearchproduct_citations -- Otherresearchproduct_citations
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS
SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and o.datainfo.deletedbyinference = false; and o.datainfo.deletedbyinference = false;
@ -87,20 +87,3 @@ CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS FOR COLUMNS;

View File

@ -13,11 +13,17 @@ WHERE r.reltype = 'projectOrganization'
and r.datainfo.deletedbyinference = false; and r.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.project_results AS CREATE TABLE ${stats_db_name}.project_results AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject' WHERE r.reltype = 'resultProject'
and r.datainfo.deletedbyinference = false; and r.datainfo.deletedbyinference = false;
create table ${stats_db_name}.project_classification as
select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
from ${openaire_db_name}.project p
lateral view explode(p.h2020classification) classifs as class
where p.datainfo.deletedbyinference=false and class.h2020programme is not null;
CREATE TABLE ${stats_db_name}.project_tmp CREATE TABLE ${stats_db_name}.project_tmp
( (
id STRING, id STRING,

View File

@ -130,12 +130,7 @@ WHERE r.reltype = 'resultOrganization'
and r.datainfo.deletedbyinference = false; and r.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.result_projects AS CREATE TABLE ${stats_db_name}.result_projects AS
select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
FROM ${stats_db_name}.result r FROM ${stats_db_name}.result r
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id;
-- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS FOR COLUMNS;

View File

@ -17,7 +17,9 @@ CREATE TABLE ${stats_db_name}.datasource_tmp
`latitude` STRING, `latitude` STRING,
`longitude` STRING, `longitude` STRING,
`websiteurl` STRING, `websiteurl` STRING,
`compatibility` STRING `compatibility` STRING,
issn_printed STRING,
issn_online STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
-- Insert statement that takes into account the piwik_id of the openAIRE graph -- Insert statement that takes into account the piwik_id of the openAIRE graph
@ -32,7 +34,9 @@ SELECT substr(d1.id, 4) AS id,
d1.latitude.value AS latitude, d1.latitude.value AS latitude,
d1.longitude.value AS longitude, d1.longitude.value AS longitude,
d1.websiteurl.value AS websiteurl, d1.websiteurl.value AS websiteurl,
d1.openairecompatibility.classid AS compatibility d1.openairecompatibility.classid AS compatibility,
d1.journal.issnprinted AS issn_printed,
d1.journal.issnonline AS issn_online
FROM ${openaire_db_name}.datasource d1 FROM ${openaire_db_name}.datasource d1
LEFT OUTER JOIN LEFT OUTER JOIN
(SELECT id, split(originalidd, '\\:')[1] as piwik_id (SELECT id, split(originalidd, '\\:')[1] as piwik_id
@ -51,7 +55,7 @@ CREATE TABLE ${stats_db_name}.dual
INSERT INTO ${stats_db_name}.dual INSERT INTO ${stats_db_name}.dual
VALUES ('X'); VALUES ('X');
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`) `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
SELECT 'other', SELECT 'other',
'Other', 'Other',
'Repository', 'Repository',
@ -62,7 +66,9 @@ SELECT 'other',
NULL, NULL,
NULL, NULL,
NULL, NULL,
'unknown' 'unknown',
null,
null
FROM ${stats_db_name}.dual FROM ${stats_db_name}.dual
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
DROP TABLE ${stats_db_name}.dual; DROP TABLE ${stats_db_name}.dual;
@ -98,12 +104,3 @@ where d.datainfo.deletedbyinference = false;
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id, id AS result SELECT datasource AS id, id AS result
FROM ${stats_db_name}.result_datasources; FROM ${stats_db_name}.result_datasources;
-- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS FOR COLUMNS;

View File

@ -205,6 +205,11 @@
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
<version>1.0.7</version> <version>1.0.7</version>
</dependency> </dependency>
<dependency>
<groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId>
<version>0.0.7</version>
</dependency>
<dependency> <dependency>
<groupId>com.google.guava</groupId> <groupId>com.google.guava</groupId>