merged from beta

Claudio Atzori 2022-04-22 11:55:25 +02:00
commit 888f2de196
208 changed files with 8330 additions and 1242 deletions

.scalafmt.conf (new file)

@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
# a single list: repeated assignments to rewrite.rules would override each other, keeping only the last rule
rewrite.rules = [AvoidInfix, ExpandImportSelectors, RedundantBraces, RedundantParens, SortImports, SortModifiers, PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true


@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>


@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>


@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
<packaging>jar</packaging>
@ -47,12 +47,16 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.9.1</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties>


@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>


@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>


@ -0,0 +1,40 @@
package eu.dnetlib.dhp.common.collection;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class DecompressTarGz {
public static void doExtract(FileSystem fs, String outputPath, String tarGzPath) throws IOException {
FSDataInputStream inputFileStream = fs.open(new Path(tarGzPath));
try (TarArchiveInputStream tais = new TarArchiveInputStream(
new GzipCompressorInputStream(inputFileStream))) {
TarArchiveEntry entry = null;
while ((entry = tais.getNextTarEntry()) != null) {
if (!entry.isDirectory()) {
try (
FSDataOutputStream out = fs
.create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
IOUtils.copy(tais, gzipOs);
}
}
}
}
}
}
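
A minimal usage sketch for the new DecompressTarGz helper (illustrative only, not part of the commit): the Hadoop configuration and both paths below are assumptions. Note that doExtract concatenates each entry name directly onto outputPath, so the output prefix should end with a trailing slash.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import eu.dnetlib.dhp.common.collection.DecompressTarGz;

public class DecompressTarGzExample {
	public static void main(String[] args) throws Exception {
		// default Hadoop configuration; in a workflow this would resolve to the cluster file system
		FileSystem fs = FileSystem.get(new Configuration());
		// every entry of the tar.gz is re-compressed as an individual .gz file under the output prefix
		DecompressTarGz.doExtract(fs, "/tmp/opencitations/extracted/", "/tmp/opencitations/dump.tar.gz");
	}
}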


@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.dedup;
package eu.dnetlib.dhp.oa.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -38,7 +38,7 @@ public class DispatchEntitiesSparkJob {
.requireNonNull(
DispatchEntitiesSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/dispatch_entities_parameters.json")));
"/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json")));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);


@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.dedup;
package eu.dnetlib.dhp.oa.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
@ -53,7 +53,7 @@ public class GroupEntitiesSparkJob {
.toString(
GroupEntitiesSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/group_graph_entities_parameters.json"));
"/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);


@ -47,6 +47,17 @@ public class OafMapperUtils {
}
public static Result mergeResults(Result left, Result right) {
final boolean leftFromDelegatedAuthority = isFromDelegatedAuthority(left);
final boolean rightFromDelegatedAuthority = isFromDelegatedAuthority(right);
if (leftFromDelegatedAuthority && !rightFromDelegatedAuthority) {
return left;
}
if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
return right;
}
if (new ResultTypeComparator().compare(left, right) < 0) {
left.mergeFrom(right);
return left;
@ -56,6 +67,18 @@ public class OafMapperUtils {
}
}
private static boolean isFromDelegatedAuthority(Result r) {
return Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.filter(i -> Objects.nonNull(i.getCollectedfrom()))
.map(i -> i.getCollectedfrom().getKey())
.anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId)))
.orElse(false);
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
@ -368,4 +391,19 @@ public class OafMapperUtils {
}
return null;
}
public static KeyValue newKeyValueInstance(String key, String value, DataInfo dataInfo) {
KeyValue kv = new KeyValue();
kv.setDataInfo(dataInfo);
kv.setKey(key);
kv.setValue(value);
return kv;
}
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
Measure m = new Measure();
m.setId(id);
m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo)));
return m;
}
}
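
For reference, a small sketch of the Measure produced by the new newMeasureInstance helper; the class name, the literal values and the null DataInfo are assumptions, while the "downloads"/"count" pair mirrors how the usage-stats action set job further below uses this method.

import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

public class MeasureExample {
	public static void main(String[] args) {
		// id identifies the measure; key/value become its single unit entry; dataInfo left null for brevity
		Measure downloads = OafMapperUtils.newMeasureInstance("downloads", "10", "count", null);
		System.out.println(downloads.getId() + " -> " + downloads.getUnit().get(0).getValue()); // prints: downloads -> 10
	}
}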


@ -185,6 +185,22 @@ class OafMapperUtilsTest {
.getClassid());
}
@Test
void testDelegatedAuthority() throws IOException {
Dataset d1 = read("dataset_2.json", Dataset.class);
Dataset d2 = read("dataset_delegated.json", Dataset.class);
assertEquals(1, d2.getCollectedfrom().size());
assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
Result res = OafMapperUtils.mergeResults(d1, d2);
assertEquals(d2, res);
System.out.println(OBJECT_MAPPER.writeValueAsString(res));
}
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
}


@ -1 +1,140 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}


@ -0,0 +1,140 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}


@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-actionmanager</artifactId>


@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
# a single list: repeated assignments to rewrite.rules would override each other, keeping only the last rule
rewrite.rules = [AvoidInfix, ExpandImportSelectors, RedundantBraces, RedundantParens, SortImports, SortModifiers, PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true


@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
<build>


@ -27,6 +27,8 @@ public class Constants {
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg";
public static final String UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID = "measure:usage_counts";
public static final String UPDATE_KEY_USAGE_COUNTS = "count";
public static final String FOS_CLASS_ID = "FOS";
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";


@ -21,8 +21,10 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareFOSSparkJob implements Serializable {
@ -71,6 +73,7 @@ public class PrepareFOSSparkJob implements Serializable {
Result r = new Result();
FOSDataModel first = it.next();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
HashSet<String> level1 = new HashSet<>();
HashSet<String> level2 = new HashSet<>();
HashSet<String> level3 = new HashSet<>();
@ -81,6 +84,19 @@ public class PrepareFOSSparkJob implements Serializable {
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()


@ -21,8 +21,10 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareSDGSparkJob implements Serializable {
@ -78,6 +80,19 @@ public class PrepareSDGSparkJob implements Serializable {
s -> sbjs
.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
r.setSubject(sbjs);
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()


@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
@ -67,7 +68,19 @@ public class SparkSaveUnresolved implements Serializable {
.groupByKey((MapFunction<Result, String>) Result::getId, Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
Result ret = it.next();
it.forEachRemaining(r -> ret.mergeFrom(r));
it.forEachRemaining(r -> {
if (r.getInstance() != null) {
ret.setInstance(r.getInstance());
}
if (r.getSubject() != null) {
if (ret.getSubject() != null)
ret.getSubject().addAll(r.getSubject());
else
ret.setSubject(r.getSubject());
}
// ret.mergeFrom(r)
});
return ret;
}, Encoders.bean(Result.class))
.write()


@ -14,6 +14,7 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
@ -21,6 +22,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -83,10 +85,13 @@ public class CreateActionSetSparkJob implements Serializable {
private static void extractContent(SparkSession spark, String inputPath, String outputPath,
boolean shouldDuplicateRels) {
spark
.sqlContext()
.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
.read()
.textFile(inputPath + "/*")
.map(
(MapFunction<String, COCI>) value -> OBJECT_MAPPER.readValue(value, COCI.class),
Encoders.bean(COCI.class))
.flatMap(
(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
(FlatMapFunction<COCI, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
Encoders.bean(Relation.class))
.filter((FilterFunction<Relation>) value -> value != null)
.toJavaRDD()
@ -98,26 +103,30 @@ public class CreateActionSetSparkJob implements Serializable {
}
private static List<Relation> createRelation(String value, boolean duplicate) {
String[] line = value.split(",");
if (!line[1].startsWith("10.")) {
return new ArrayList<>();
}
private static List<Relation> createRelation(COCI value, boolean duplicate) {
List<Relation> relationList = new ArrayList<>();
String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
String citing = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting()));
final String cited = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
relationList
.addAll(
getRelations(
citing,
cited));
if (!citing.equals(cited)) {
relationList
.addAll(
getRelations(
citing,
cited));
if (duplicate && line[1].endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs"))));
relationList.addAll(getRelations(citing, cited));
if (duplicate && value.getCiting().endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(
CleaningFunctions
.normalizePidValue(
"doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs"))));
relationList.addAll(getRelations(citing, cited));
}
}
return relationList;


@ -0,0 +1,103 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class ReadCOCI implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ReadCOCI.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
ReadCOCI.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String[] inputFile = parser.get("inputFile").split(";");
log.info("inputFile {}", inputFile.toString());
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath {}", workingPath);
SparkConf sconf = new SparkConf();
final String delimiter = Optional
.ofNullable(parser.get("delimiter"))
.orElse(DEFAULT_DELIMITER);
runWithSparkSession(
sconf,
isSparkSessionManaged,
spark -> {
doRead(
spark,
workingPath,
inputFile,
outputPath,
delimiter);
});
}
private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
String outputPath,
String delimiter) throws IOException {
for (String inputFile : inputFiles) {
String p_string = workingPath + "/" + inputFile + ".gz";
Dataset<Row> cociData = spark
.read()
.format("csv")
.option("sep", delimiter)
.option("inferSchema", "true")
.option("header", "true")
.option("quotes", "\"")
.load(p_string)
.repartition(100);
cociData.map((MapFunction<Row, COCI>) row -> {
COCI coci = new COCI();
coci.setOci(row.getString(0));
coci.setCiting(row.getString(1));
coci.setCited(row.getString(2));
return coci;
}, Encoders.bean(COCI.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + inputFile);
}
}
}


@ -0,0 +1,39 @@
package eu.dnetlib.dhp.actionmanager.opencitations.model;
import java.io.Serializable;
import com.opencsv.bean.CsvBindByPosition;
public class COCI implements Serializable {
private String oci;
private String citing;
private String cited;
public String getOci() {
return oci;
}
public void setOci(String oci) {
this.oci = oci;
}
public String getCiting() {
return citing;
}
public void setCiting(String citing) {
this.citing = citing;
}
public String getCited() {
return cited;
}
public void setCited(String cited) {
this.cited = cited;
}
}


@ -3,6 +3,7 @@ package eu.dnetlib.dhp.actionmanager.ror;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORG_ORG_RELTYPE;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
@ -29,8 +30,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
@ -38,8 +38,8 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;
import eu.dnetlib.dhp.actionmanager.ror.model.Relationship;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
@ -48,8 +48,10 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
@ -112,24 +114,21 @@ public class GenerateRorActionSetJob {
final String outputPath) throws IOException {
readInputPath(spark, inputPath)
.map(
(MapFunction<RorOrganization, Organization>) GenerateRorActionSetJob::convertRorOrg,
Encoders.bean(Organization.class))
.toJavaRDD()
.map(o -> new AtomicAction<>(Organization.class, o))
.map(GenerateRorActionSetJob::convertRorOrg)
.flatMap(List::iterator)
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
protected static Organization convertRorOrg(final RorOrganization r) {
protected static List<AtomicAction<? extends Oaf>> convertRorOrg(final RorOrganization r) {
final Date now = new Date();
final Organization o = new Organization();
o.setId(String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(r.getId())));
o.setId(calculateOpenaireId(r.getId()));
o.setOriginalId(Arrays.asList(String.format("%s::%s", ROR_NS_PREFIX, r.getId())));
o.setCollectedfrom(ROR_COLLECTED_FROM);
o.setPid(pids(r));
@ -166,7 +165,43 @@ public class GenerateRorActionSetJob {
o.setDataInfo(ROR_DATA_INFO);
o.setLastupdatetimestamp(now.getTime());
return o;
final List<AtomicAction<? extends Oaf>> res = new ArrayList<>();
res.add(new AtomicAction<>(Organization.class, o));
for (final Relationship rorRel : r.getRelationships()) {
if (rorRel.getType().equalsIgnoreCase("parent")) {
final String orgId1 = calculateOpenaireId(r.getId());
final String orgId2 = calculateOpenaireId(rorRel.getId());
res
.add(
new AtomicAction<>(Relation.class,
calculateHierarchyRel(orgId1, orgId2, ModelConstants.IS_PARENT_OF)));
res
.add(
new AtomicAction<>(Relation.class,
calculateHierarchyRel(orgId2, orgId1, ModelConstants.IS_CHILD_OF)));
}
}
return res;
}
private static Relation calculateHierarchyRel(final String source, final String target, final String relClass) {
final Relation rel = new Relation();
rel.setSource(source);
rel.setTarget(target);
rel.setRelType(ORG_ORG_RELTYPE);
rel.setSubRelType(ModelConstants.RELATIONSHIP);
rel.setRelClass(relClass);
rel.setCollectedfrom(ROR_COLLECTED_FROM);
rel.setDataInfo(ROR_DATA_INFO);
rel.setLastupdatetimestamp(System.currentTimeMillis());
return rel;
}
private static String calculateOpenaireId(final String rorId) {
return String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(rorId));
}
private static List<StructuredProperty> pids(final RorOrganization r) {
@ -202,14 +237,14 @@ public class GenerateRorActionSetJob {
.collect(Collectors.toList());
}
private static Dataset<RorOrganization> readInputPath(
private static JavaRDD<RorOrganization> readInputPath(
final SparkSession spark,
final String path) throws IOException {
try (final FileSystem fileSystem = FileSystem.get(new Configuration());
final InputStream is = fileSystem.open(new Path(path))) {
final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class);
return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class));
return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class)).toJavaRDD();
}
}


@ -0,0 +1,149 @@
package eu.dnetlib.dhp.actionmanager.usagestats;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
/**
Creates the atomic actions carrying the usage counts for each result
*/
public class SparkAtomicActionUsageJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionUsageJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkAtomicActionUsageJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
final String dbname = parser.get("usagestatsdb");
final String workingPath = parser.get("workingPath");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
prepareResults(dbname, spark, workingPath);
prepareActionSet(spark, workingPath, outputPath);
});
}
public static void prepareResults(String db, SparkSession spark, String workingPath) {
spark
.sql(
"Select result_id, downloads, views " +
"from " + db + ".usage_stats")
.as(Encoders.bean(UsageStatsModel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
}
public static void prepareActionSet(SparkSession spark, String inputPath, String outputPath) {
readPath(spark, inputPath, UsageStatsModel.class)
.groupByKey((MapFunction<UsageStatsModel, String>) us -> us.getResult_id(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, UsageStatsModel, Result>) (k, it) -> {
UsageStatsModel first = it.next();
it.forEachRemaining(us -> {
first.setDownloads(first.getDownloads() + us.getDownloads());
first.setViews(first.getViews() + us.getViews());
});
Result res = new Result();
res.setId("50|" + k);
res.setMeasures(getMeasure(first.getDownloads(), first.getViews()));
return res;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static List<Measure> getMeasure(Long downloads, Long views) {
DataInfo dataInfo = OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"");
return Arrays
.asList(
OafMapperUtils
.newMeasureInstance("downloads", String.valueOf(downloads), UPDATE_KEY_USAGE_COUNTS, dataInfo),
OafMapperUtils.newMeasureInstance("views", String.valueOf(views), UPDATE_KEY_USAGE_COUNTS, dataInfo));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
}


@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.usagestats;
import java.io.Serializable;
public class UsageStatsModel implements Serializable {
private String result_id;
private Long downloads;
private Long views;
public String getResult_id() {
return result_id;
}
public void setResult_id(String result_id) {
this.result_id = result_id;
}
public Long getDownloads() {
return downloads;
}
public void setDownloads(Long downloads) {
this.downloads = downloads;
}
public Long getViews() {
return views;
}
public void setViews(Long views) {
this.views = views;
}
}


@ -0,0 +1,37 @@
[
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the zipped opencitations file",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "the hdfs name node",
"paramRequired": false
},
{
"paramName": "d",
"paramLongName": "delimiter",
"paramDescription": "the hdfs name node",
"paramRequired": false
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the hdfs name node",
"paramRequired": true
},
{
"paramName": "if",
"paramLongName": "inputFile",
"paramDescription": "the hdfs name node",
"paramRequired": true
}
]


@ -26,6 +26,7 @@
<switch>
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
<case to="extract">${wf:conf('resumeFrom') eq 'ExtractContent'}</case>
<case to="read">${wf:conf('resumeFrom') eq 'ReadContent'}</case>
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
</switch>
</decision>
@ -60,6 +61,32 @@
<arg>--inputFile</arg><arg>${inputFile}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</java>
<ok to="read"/>
<error to="Kill"/>
</action>
<action name="read">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the AS for OC</name>
<class>eu.dnetlib.dhp.actionmanager.opencitations.ReadCOCI</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
<arg>--outputPath</arg><arg>${workingPath}/COCI_JSON/</arg>
<arg>--delimiter</arg><arg>${delimiter}</arg>
<arg>--inputFile</arg><arg>${inputFileCoci}</arg>
</spark>
<ok to="create_actionset"/>
<error to="Kill"/>
</action>
@ -81,7 +108,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
<arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>


@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "hmu",
"paramLongName": "hive_metastore_uris",
"paramDescription": "the URI for the hive metastore",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "sdb",
"paramLongName": "usagestatsdb",
"paramDescription": "the name of the db to be used",
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the workingPath where to save the content of the usage_stats table",
"paramRequired": true
}
]


@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>


@ -0,0 +1,99 @@
<workflow-app name="UsageStatsCounts" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
<property>
<name>usagestatsdb</name>
<description>the name of the db to be used</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="atomicactions"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="atomicactions">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the usage stats count for results</name>
<class>eu.dnetlib.dhp.actionmanager.usagestats.SparkAtomicActionUsageJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--hive_metastore_uris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--usagestatsdb</arg><arg>${usagestatsdb}</arg>
<arg>--workingPath</arg><arg>${workingDir}/usageDb</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>


@ -0,0 +1,63 @@
from urllib.request import urlopen
import json
def retrieve_datacite_clients(base_url):
datacite_clients = {}
while base_url is not None:
with urlopen(base_url) as response:
print(f"requesting {base_url}")
response_content = response.read()
data = json.loads(response_content)
if 'data' in data and len(data['data'])>0:
for item in data['data']:
datacite_clients[item['id'].lower()]= item['attributes']['re3data'].lower().replace("https://doi.org/","")
base_url = data['links'].get('next')  # stop when the API returns no further page
else:
base_url = None
return datacite_clients
def retrieve_r3data(start_url):
r3data_clients = {}
page_number = 1
base_url = start_url
while base_url is not None:
with urlopen(base_url) as response:
print(f"requesting {base_url}")
response_content = response.read()
data = json.loads(response_content)
if 'data' in data and len(data['data'])>0:
for item in data['data']:
r3data_clients[item['id'].lower()]= dict(
openaire_id= "re3data_____::"+item['attributes']['re3dataId'].lower(),
official_name=item['attributes']['repositoryName']
)
page_number +=1
base_url = f"{start_url}&page[number]={page_number}"
else:
base_url = None
return r3data_clients
base_url ="https://api.datacite.org/clients?query=re3data_id:*&page[size]=250"
dc = retrieve_datacite_clients(base_url)
r3 = retrieve_r3data("https://api.datacite.org/re3data?page[size]=250")
result = {}
for item in dc:
res = dc[item].lower()
if res not in r3:
print(f"missing {res} for {item} in dictionary")
else:
result[item.upper()]= dict(openaire_id=r3[res]["openaire_id"],datacite_name=r3[res]["official_name"], official_name=r3[res]["official_name"] )
with open('hostedBy_map.json', 'w', encoding='utf8') as json_file:
json.dump(result, json_file, ensure_ascii=False, indent=1)


@ -49,7 +49,7 @@ abstract class AbstractRestClient extends Iterator[String] {
}
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
val timeout = 60; // seconds
val timeout = 600; // seconds
val config = RequestConfig
.custom()
.setConnectTimeout(timeout * 1000)


@ -46,7 +46,7 @@ object ImportDatacite {
Source
.fromInputStream(
getClass.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
"/eu/dnetlib/dhp/datacite/import_from_api.json"
)
)
.mkString


@ -146,6 +146,11 @@ public class PrepareTest {
.get(0)
.getValue());
final String doi2 = "unresolved::10.3390/s18072310::doi";
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count());
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size());
}
@Test
@ -259,59 +264,61 @@ public class PrepareTest {
.collect()
.contains("8. Economic growth"));
}
@Test
void test3() throws Exception {
final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_fos_results_20_12_2021.csv.gz";
final String outputPath = workingDir.toString() + "/fos.json";
GetFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"-outputPath", outputPath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<FOSDataModel> tmp = sc
.textFile(outputPath)
.map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));
tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
Assertions.assertEquals(32, tmp.filter(row -> row.getDataInfo() != null).count());
}
@Test
void test4() throws Exception {
final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_sdg_results_20_12_21.csv.gz";
final String outputPath = workingDir.toString() + "/sdg.json";
GetSDGSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"-outputPath", outputPath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<SDGDataModel> tmp = sc
.textFile(outputPath)
.map(item -> OBJECT_MAPPER.readValue(item, SDGDataModel.class));
tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getSbj() != null));
}
// @Test
// void test3() throws Exception {
// final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_fos_results_20_12_2021.csv.gz";
//
// final String outputPath = workingDir.toString() + "/fos.json";
// GetFOSSparkJob
// .main(
// new String[] {
// "--isSparkSessionManaged", Boolean.FALSE.toString(),
// "--sourcePath", sourcePath,
//
// "-outputPath", outputPath
//
// });
//
// final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
//
// JavaRDD<FOSDataModel> tmp = sc
// .textFile(outputPath)
// .map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));
//
// tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
// tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null));
// tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null));
// tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
//
// }
//
// @Test
// void test4() throws Exception {
// final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_sdg_results_20_12_21.csv.gz";
//
// final String outputPath = workingDir.toString() + "/sdg.json";
// GetSDGSparkJob
// .main(
// new String[] {
// "--isSparkSessionManaged", Boolean.FALSE.toString(),
// "--sourcePath", sourcePath,
//
// "-outputPath", outputPath
//
// });
//
// final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
//
// JavaRDD<SDGDataModel> tmp = sc
// .textFile(outputPath)
// .map(item -> OBJECT_MAPPER.readValue(item, SDGDataModel.class));
//
// tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
// tmp.foreach(t -> Assertions.assertTrue(t.getSbj() != null));
//
// }
}


@ -196,6 +196,9 @@ public class ProduceTest {
final String doi = "unresolved::10.3390/s18072310::doi";
JavaRDD<Result> tmp = getResultJavaRDD();
tmp
.filter(row -> row.getId().equals(doi))
.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
Assertions
.assertEquals(
3, tmp


@ -76,7 +76,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@ -99,7 +99,7 @@ public class CreateOpenCitationsASTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
assertEquals(60, tmp.count());
assertEquals(62, tmp.count());
// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
@ -110,7 +110,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@ -131,7 +131,7 @@ public class CreateOpenCitationsASTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
assertEquals(44, tmp.count());
assertEquals(46, tmp.count());
// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
@ -142,7 +142,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@ -175,7 +175,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@ -215,7 +215,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@ -240,8 +240,8 @@ public class CreateOpenCitationsASTest {
assertEquals("citation", r.getSubRelType());
assertEquals("resultResult", r.getRelType());
});
assertEquals(22, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
assertEquals(22, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
assertEquals(23, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
assertEquals(23, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
}
@ -250,7 +250,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@ -295,7 +295,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob


@ -0,0 +1,138 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.schema.oaf.Dataset;
public class ReadCOCITest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(ReadCOCITest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(ReadCOCITest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(ReadCOCITest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(ReadCOCITest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void testReadCOCI() throws Exception {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath();
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input1.gz"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input2.gz"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input3.gz"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input4.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4.gz"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input5.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input5.gz"));
ReadCOCI
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-workingPath",
workingDir.toString() + "/COCI",
"-outputPath",
workingDir.toString() + "/COCI_json/",
"-inputFile", "input1;input2;input3;input4;input5"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<COCI> tmp = sc
.textFile(workingDir.toString() + "/COCI_json/*/")
.map(item -> OBJECT_MAPPER.readValue(item, COCI.class));
Assertions.assertEquals(24, tmp.count());
Assertions.assertEquals(1, tmp.filter(c -> c.getCiting().equals("10.1207/s15327647jcd3,4-01")).count());
Assertions.assertEquals(8, tmp.filter(c -> c.getCiting().indexOf(".refs") > -1).count());
}
}


@ -1,7 +1,10 @@
package eu.dnetlib.dhp.actionmanager.ror;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.FileInputStream;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
@ -13,9 +16,12 @@ import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;
@Disabled
class GenerateRorActionSetJobTest {
private static final ObjectMapper mapper = new ObjectMapper();
@ -30,21 +36,40 @@ class GenerateRorActionSetJobTest {
void testConvertRorOrg() throws Exception {
final RorOrganization r = mapper
.readValue(IOUtils.toString(getClass().getResourceAsStream("ror_org.json")), RorOrganization.class);
final Organization org = GenerateRorActionSetJob.convertRorOrg(r);
final List<AtomicAction<? extends Oaf>> aas = GenerateRorActionSetJob.convertRorOrg(r);
Assertions.assertEquals(3, aas.size());
assertEquals(Organization.class, aas.get(0).getClazz());
assertEquals(Relation.class, aas.get(1).getClazz());
assertEquals(Relation.class, aas.get(2).getClazz());
final Organization o = (Organization) aas.get(0).getPayload();
final Relation r1 = (Relation) aas.get(1).getPayload();
final Relation r2 = (Relation) aas.get(2).getPayload();
assertEquals(o.getId(), r1.getSource());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
assertEquals(ModelConstants.IS_PARENT_OF, r1.getRelClass());
assertEquals(ModelConstants.IS_CHILD_OF, r2.getRelClass());
System.out.println(mapper.writeValueAsString(o));
System.out.println(mapper.writeValueAsString(r1));
System.out.println(mapper.writeValueAsString(r2));
final String s = mapper.writeValueAsString(org);
Assertions.assertTrue(StringUtils.isNotBlank(s));
System.out.println(s);
}
@Test
@Disabled
void testConvertAllRorOrg() throws Exception {
final RorOrganization[] arr = mapper
.readValue(IOUtils.toString(new FileInputStream(local_file_path)), RorOrganization[].class);
for (final RorOrganization r : arr) {
Organization o = GenerateRorActionSetJob.convertRorOrg(r);
Assertions.assertNotNull(o);
final List<AtomicAction<? extends Oaf>> aas = GenerateRorActionSetJob.convertRorOrg(r);
Assertions.assertFalse(aas.isEmpty());
Assertions.assertNotNull(aas.get(0));
final Organization o = (Organization) aas.get(0).getPayload();
Assertions.assertTrue(StringUtils.isNotBlank(o.getId()));
}
}

View File

@ -0,0 +1,259 @@
package eu.dnetlib.dhp.actionmanager.usagestats;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Result;
public class SparkAtomicActionCountJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(SparkAtomicActionCountJobTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(SparkAtomicActionCountJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(SparkAtomicActionCountJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(SparkAtomicActionCountJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void testMatch() {
String usageScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/usagestats/usagestatsdb")
.getPath();
SparkAtomicActionUsageJob.prepareActionSet(spark, usageScoresPath, workingDir.toString() + "/actionSet");
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/actionSet")
.map(usm -> OBJECT_MAPPER.readValue(usm, Result.class));
Assertions.assertEquals(9, tmp.count());
tmp.foreach(r -> Assertions.assertEquals(2, r.getMeasures().size()));
tmp
.foreach(
r -> r
.getMeasures()
.stream()
.forEach(
m -> m
.getUnit()
.stream()
.forEach(u -> Assertions.assertFalse(u.getDataInfo().getDeletedbyinference()))));
tmp
.foreach(
r -> r
.getMeasures()
.stream()
.forEach(
m -> m.getUnit().stream().forEach(u -> Assertions.assertTrue(u.getDataInfo().getInferred()))));
tmp
.foreach(
r -> r
.getMeasures()
.stream()
.forEach(
m -> m
.getUnit()
.stream()
.forEach(u -> Assertions.assertFalse(u.getDataInfo().getInvisible()))));
tmp
.foreach(
r -> r
.getMeasures()
.stream()
.forEach(
m -> m
.getUnit()
.stream()
.forEach(
u -> Assertions
.assertEquals(
"measure:usage_counts",
u.getDataInfo().getProvenanceaction().getClassid()))));
tmp
.foreach(
r -> r
.getMeasures()
.stream()
.forEach(
m -> m
.getUnit()
.stream()
.forEach(
u -> Assertions
.assertEquals(
"Inferred by OpenAIRE",
u.getDataInfo().getProvenanceaction().getClassname()))));
tmp
.foreach(
r -> r
.getMeasures()
.stream()
.forEach(
m -> m
.getUnit()
.stream()
.forEach(
u -> Assertions
.assertEquals(
"count",
u.getKey()))));
Assertions
.assertEquals(
1, tmp.filter(r -> r.getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6")).count());
Assertions
.assertEquals(
"0",
tmp
.filter(r -> r.getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6"))
.collect()
.get(0)
.getMeasures()
.stream()
.filter(m -> m.getId().equals("downloads"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"5",
tmp
.filter(r -> r.getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6"))
.collect()
.get(0)
.getMeasures()
.stream()
.filter(m -> m.getId().equals("views"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"0",
tmp
.filter(r -> r.getId().equals("50|doi_________::17eda2ff77407538fbe5d3d719b9d1c0"))
.collect()
.get(0)
.getMeasures()
.stream()
.filter(m -> m.getId().equals("downloads"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"1",
tmp
.filter(r -> r.getId().equals("50|doi_________::17eda2ff77407538fbe5d3d719b9d1c0"))
.collect()
.get(0)
.getMeasures()
.stream()
.filter(m -> m.getId().equals("views"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"2",
tmp
.filter(r -> r.getId().equals("50|doi_________::3085e4c6e051378ca6157fe7f0430c1f"))
.collect()
.get(0)
.getMeasures()
.stream()
.filter(m -> m.getId().equals("downloads"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"6",
tmp
.filter(r -> r.getId().equals("50|doi_________::3085e4c6e051378ca6157fe7f0430c1f"))
.collect()
.get(0)
.getMeasures()
.stream()
.filter(m -> m.getId().equals("views"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
}
}

View File

@ -1,8 +0,0 @@
oci,citing,cited,creation,timespan,journal_sc,author_sc
02001000007362801000805046300010563030608046333-0200101010136193701050501630209010637020000083700020400083733,10.1007/s10854-015-3684-x,10.1111/j.1551-2916.2008.02408.x,2015-09-01,P7Y2M,no,no
02001000007362801000805046300010563030608046333-02001000007362801000805046300010463020101046309,10.1007/s10854-015-3684-x,10.1007/s10854-014-2114-9,2015-09-01,P1Y2M4D,yes,no
02001000007362801000805046300010563030608046333-020010001063619371214271022182329370200010337000937000609,10.1007/s10854-015-3684-x,10.1016/j.ceramint.2013.09.069,2015-09-01,P1Y6M,no,no
02001000007362801000805046300010563030608046333-02001000007362801000805046300000963090901036304,10.1007/s10854-015-3684-x,10.1007/s10854-009-9913-4,2015-09-01,P6Y3M10D,yes,no
02001000007362801000805046300010563030608046333-02001000106360000030863010009085807025909000307006305,10.1007/s10854-015-3684-x,10.1016/0038-1098(72)90370-5,2015-09-01,P43Y8M,no,no
02001000007362801000805046300010563030608056309-02001000106361937281010370200010437000937000308,10.1007/s10854-015-3685-9,10.1016/j.saa.2014.09.038,2015-09-03,P0Y7M,no,no
02001000007362801000805046300010563030608056309-0200100010636193722102912171027370200010537000437000106,10.1007/s10854-015-3685-9,10.1016/j.matchar.2015.04.016,2015-09-03,P0Y2M,no,no

View File

@ -1,8 +0,0 @@
oci,citing,cited,creation,timespan,journal_sc,author_sc
02001000308362804010509076300010963000003086301-0200100020936020001003227000009010004,10.1038/s41597-019-0038-1,10.1029/2010wr009104,2019-04-15,P8Y1M,no,no
02001000308362804010509076300010963000003086301-0200100010636280103060463080105025800015900000006006303,10.1038/s41597-019-0038-1,10.1016/s1364-8152(01)00060-3,2019-04-15,P17Y3M,no,no
02001000308362804010509076300010963000003086301-02001000007362800000407076300010063000401066333,10.1038/s41597-019-0038-1,10.1007/s00477-010-0416-x,2019-04-15,P8Y9M6D,no,no
02001000308362804010509076300010963000003086301-02001000007362800000700046300010363000905016308,10.1038/s41597-019-0038-1,10.1007/s00704-013-0951-8,2019-04-15,P5Y9M23D,no,no
02001000308362804010509076300010963000003086301-02001000002361924123705070707,10.1038/s41597-019-0038-1,10.1002/joc.5777,2019-04-15,P0Y8M1D,no,no
02001000308362804010509076300010963000003086301-02005010904361714282863020263040504076302000108,10.1038/s41597-019-0038-1,10.5194/hess-22-4547-2018,2019-04-15,P0Y7M18D,no,no
02001000308362804010509076300010963000003086301-02001000002361924123703050404,10.1038/s41597-019-0038-1,10.1002/joc.3544,2019-04-15,P6Y9M6D,no,no

View File

@ -1,9 +0,0 @@
oci,citing,cited,creation,timespan,journal_sc,author_sc
0200100000236090708010101090307000202023727141528-020050302063600040000010307,10.1002/9781119370222.refs,10.5326/0400137,2020-06-22,P16Y3M,no,no
0200100000236090708010101090307000202023727141528-0200101010136193701050302630905003337020000073700000301093733,10.1002/9781119370222.refs,10.1111/j.1532-950x.2007.00319.x,2020-06-22,P12Y8M,no,no
0200100000236090708010101090307000202023727141528-0200101010136312830370102030509,10.1002/9781119370222.refs,10.1111/vsu.12359,2020-06-22,P4Y10M29D,no,no
0200100000236090708010101090307000202023727141528-020050302063600030900020904,10.1002/9781119370222.refs,10.5326/0390294,2020-06-22,P17Y1M,no,no
0200100000236090708010101090307000202023727141528-020050302063600040200030701,10.1002/9781119370222.refs,10.5326/0420371,2020-06-22,P13Y9M,no,no
0200100000236090708010101090307000202023727141528-0200101010136193701050302630905003337020001033701020000003733,10.1002/9781119370222.refs,10.1111/j.1532-950x.2013.12000.x,2020-06-22,P7Y2M,no,no
0200100000236090708010101090307000202023727141528-020010008003600000408000106093702000006370306070200,10.1002/9781119370222.refs,10.1080/00480169.2006.36720,2020-06-22,P13Y6M,no,no
0200100000236090708010101090307000202023727141528-0200101010136193701070501630008010337020000063700000003033733,10.1002/9781119370222.refs,10.1111/j.1751-0813.2006.00033.x,2020-06-22,P13Y8M,no,no

View File

@ -1,123 +1,94 @@
{
"ip_addresses": [],
"aliases": [],
"acronyms": [
"ANU"
],
"links": [
"http://www.anu.edu.au/"
],
"country": {
"country_code": "AU",
"country_name": "Australia"
},
"name": "Australian National University",
"wikipedia_url": "http://en.wikipedia.org/wiki/Australian_National_University",
"addresses": [
{
"lat": -35.2778,
"state_code": "AU-ACT",
"country_geonames_id": 2077456,
"lng": 149.1205,
"state": "Australian Capital Territory",
"city": "Canberra",
"geonames_city": {
"nuts_level2": {
"name": null,
"code": null
},
"geonames_admin2": {
"ascii_name": null,
"id": null,
"name": null,
"code": null
},
"geonames_admin1": {
"ascii_name": "ACT",
"id": 2177478,
"name": "ACT",
"code": "AU.01"
},
"city": "Canberra",
"id": 2172517,
"nuts_level1": {
"name": null,
"code": null
},
"nuts_level3": {
"name": null,
"code": null
},
"license": {
"attribution": "Data from geonames.org under a CC-BY 3.0 license",
"license": "http://creativecommons.org/licenses/by/3.0/"
}
},
"postcode": null,
"primary": false,
"line": null
}
],
"types": [
"Education"
],
"established": 1946,
"relationships": [
{
"type": "Related",
"id": "https://ror.org/041c7s516",
"label": "Calvary Hospital"
},
{
"type": "Related",
"id": "https://ror.org/04h7nbn38",
"label": "Canberra Hospital"
},
{
"type": "Related",
"id": "https://ror.org/030jpqj15",
"label": "Goulburn Base Hospital"
},
{
"type": "Child",
"id": "https://ror.org/006a4jj40",
"label": "Mount Stromlo Observatory"
}
],
"email_address": null,
"external_ids": {
"Wikidata": {
"all": [
"Q127990"
],
"preferred": null
},
"OrgRef": {
"all": [
"285106"
],
"preferred": null
},
"ISNI": {
"all": [
"0000 0001 2180 7477"
],
"preferred": null
},
"FundRef": {
"all": [
"501100000995",
"501100001151",
"100009020"
],
"preferred": "501100000995"
},
"GRID": {
"all": "grid.1001.0",
"preferred": "grid.1001.0"
}
},
"id": "https://ror.org/019wvm592",
"labels": [],
"status": "active"
{
"ip_addresses": [],
"aliases": [],
"acronyms": [
"MSO"
],
"links": [
"https://rsaa.anu.edu.au/observatories/mount-stromlo-observatory"
],
"country": {
"country_code": "AU",
"country_name": "Australia"
},
"name": "Mount Stromlo Observatory",
"wikipedia_url": "https://en.wikipedia.org/wiki/Mount_Stromlo_Observatory",
"addresses": [
{
"lat": -35.320278,
"state_code": "AU-ACT",
"country_geonames_id": 2077456,
"lng": 149.006944,
"state": "Australian Capital Territory",
"city": "Canberra",
"geonames_city": {
"nuts_level2": {
"name": null,
"code": null
},
"geonames_admin2": {
"ascii_name": null,
"id": null,
"name": null,
"code": null
},
"geonames_admin1": {
"ascii_name": "ACT",
"id": 2177478,
"name": "ACT",
"code": "AU.01"
},
"city": "Canberra",
"id": 2172517,
"nuts_level1": {
"name": null,
"code": null
},
"nuts_level3": {
"name": null,
"code": null
},
"license": {
"attribution": "Data from geonames.org under a CC-BY 3.0 license",
"license": "http://creativecommons.org/licenses/by/3.0/"
}
},
"postcode": null,
"primary": false,
"line": null
}
],
"types": [
"Education"
],
"established": 1924,
"relationships": [
{
"type": "Parent",
"id": "https://ror.org/019wvm592",
"label": "Australian National University"
}
],
"email_address": null,
"external_ids": {
"ISNI": {
"all": [
"0000 0004 0459 2816"
],
"preferred": null
},
"Wikidata": {
"all": [
"Q1310548"
],
"preferred": null
},
"GRID": {
"all": "grid.440325.4",
"preferred": "grid.440325.4"
}
},
"id": "https://ror.org/006a4jj40",
"labels": [],
"status": "active"
}

View File

@ -0,0 +1,12 @@
{"result_id":"dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6","downloads":0,"views":4}
{"result_id":"dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6","downloads":0,"views":1}
{"result_id":"doi_________::17eda2ff77407538fbe5d3d719b9d1c0","downloads":0,"views":1}
{"result_id":"doi_________::1d4dc08605fd0a2be1105d30c63bfea1","downloads":1,"views":3}
{"result_id":"doi_________::2e3527822854ca9816f6dfea5bff61a8","downloads":1,"views":1}
{"result_id":"doi_________::3085e4c6e051378ca6157fe7f0430c1f","downloads":2,"views":3}
{"result_id":"doi_________::3085e4c6e051378ca6157fe7f0430c1f","downloads":0,"views":3}
{"result_id":"doi_________::33f710e6dd30cc5e67e35b371ddc33cf","downloads":0,"views":1}
{"result_id":"doi_________::39738ebf10654732dd3a7af9f24655f8","downloads":1,"views":3}
{"result_id":"doi_________::3c3b65f07c1a06c7894397eda1d11bbf","downloads":1,"views":8}
{"result_id":"doi_________::3c3b65f07c1a06c7894397eda1d11bbf","downloads":0,"views":2}
{"result_id":"doi_________::4938a71a884dd481d329657aa543b850","downloads":0,"views":3}

View File

@ -70,6 +70,8 @@ class DataciteToOAFTest extends AbstractVocabularyTest {
assertEquals(100, nativeSize)
spark.read.load(targetPath).printSchema();
val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
result

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -1,11 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -0,0 +1,192 @@
package eu.dnetlib.dhp.broker.oa;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.model.ConditionParams;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.model.MappedFields;
import eu.dnetlib.dhp.broker.model.Notification;
import eu.dnetlib.dhp.broker.model.Subscription;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.NotificationGroup;
import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils;
public class GenerateNotificationsJob {
private static final Logger log = LoggerFactory.getLogger(GenerateNotificationsJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenerateNotificationsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_notifications.json")));
parser.parseArgument(args);
final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath);
final String notificationsPath = parser.get("outputDir") + "/notifications";
log.info("notificationsPath: {}", notificationsPath);
final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
final LongAccumulator total = spark.sparkContext().longAccumulator("total_notifications");
final long startTime = new Date().getTime();
final List<Subscription> subscriptions = listSubscriptions(brokerApiBaseUrl);
log.info("Number of subscriptions: " + subscriptions.size());
if (subscriptions.size() > 0) {
final Map<String, Map<String, List<ConditionParams>>> conditionsMap = prepareConditionsMap(subscriptions);
log.info("ConditionsMap: " + new ObjectMapper().writeValueAsString(conditionsMap));
final Encoder<NotificationGroup> ngEncoder = Encoders.bean(NotificationGroup.class);
final Encoder<Notification> nEncoder = Encoders.bean(Notification.class);
final Dataset<Notification> notifications = ClusterUtils
.readPath(spark, eventsPath, Event.class)
.map(
(MapFunction<Event, NotificationGroup>) e -> generateNotifications(
e, subscriptions, conditionsMap, startTime),
ngEncoder)
.flatMap((FlatMapFunction<NotificationGroup, Notification>) g -> g.getData().iterator(), nEncoder);
ClusterUtils.save(notifications, notificationsPath, Notification.class, total);
}
}
protected static Map<String, Map<String, List<ConditionParams>>> prepareConditionsMap(
final List<Subscription> subscriptions) {
final Map<String, Map<String, List<ConditionParams>>> map = new HashMap<>();
subscriptions.forEach(s -> map.put(s.getSubscriptionId(), s.conditionsAsMap()));
return map;
}
protected static NotificationGroup generateNotifications(final Event e,
final List<Subscription> subscriptions,
final Map<String, Map<String, List<ConditionParams>>> conditionsMap,
final long date) {
final List<Notification> list = subscriptions
.stream()
.filter(
s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic()))
.filter(s -> verifyConditions(e.getMap(), conditionsMap.get(s.getSubscriptionId())))
.map(s -> generateNotification(s, e, date))
.collect(Collectors.toList());
return new NotificationGroup(list);
}
private static Notification generateNotification(final Subscription s, final Event e, final long date) {
final Notification n = new Notification();
n.setNotificationId("ntf-" + DigestUtils.md5Hex(s.getSubscriptionId() + "@@@" + e.getEventId()));
n.setSubscriptionId(s.getSubscriptionId());
n.setEventId(e.getEventId());
n.setProducerId(e.getProducerId());
n.setTopic(e.getTopic());
n.setPayload(e.getPayload());
n.setMap(e.getMap());
n.setDate(date);
return n;
}
private static boolean verifyConditions(final MappedFields map,
final Map<String, List<ConditionParams>> conditions) {
if (conditions.containsKey("targetDatasourceName")
&& !SubscriptionUtils
.verifyExact(map.getTargetDatasourceName(), conditions.get("targetDatasourceName").get(0).getValue())) {
return false;
}
if (conditions.containsKey("trust")
&& !SubscriptionUtils
.verifyFloatRange(
map.getTrust(), conditions.get("trust").get(0).getValue(),
conditions.get("trust").get(0).getOtherValue())) {
return false;
}
if (conditions.containsKey("targetDateofacceptance") && !conditions
.get("targetDateofacceptance")
.stream()
.anyMatch(
c -> SubscriptionUtils
.verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) {
return false;
}
if (conditions.containsKey("targetResultTitle")
&& !conditions
.get("targetResultTitle")
.stream()
.anyMatch(c -> SubscriptionUtils.verifySimilar(map.getTargetResultTitle(), c.getValue()))) {
return false;
}
if (conditions.containsKey("targetAuthors")
&& !conditions
.get("targetAuthors")
.stream()
.allMatch(c -> SubscriptionUtils.verifyListSimilar(map.getTargetAuthors(), c.getValue()))) {
return false;
}
return !conditions.containsKey("targetSubjects")
|| conditions
.get("targetSubjects")
.stream()
.allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()));
}
private static List<Subscription> listSubscriptions(final String brokerApiBaseUrl) throws Exception {
final String url = brokerApiBaseUrl + "/api/subscriptions";
final HttpGet req = new HttpGet(url);
final ObjectMapper mapper = new ObjectMapper();
try (final CloseableHttpClient client = HttpClients.createDefault()) {
try (final CloseableHttpResponse response = client.execute(req)) {
final String s = IOUtils.toString(response.getEntity().getContent());
return mapper
.readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, Subscription.class));
}
}
}
}
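Note (editor's sketch, not part of the commit): Subscription.conditionsAsMap() is referenced above but not shown in this diff. A minimal fragment of how the serialized conditions could become the Map<String, List<ConditionParams>> consumed by verifyConditions, mirroring the MapCondition/Jackson pattern of SimpleVariableJobTest later in this commit (same imports assumed):

// Illustrative only: same pattern as SimpleVariableJobTest.fillMap(), not the real conditionsAsMap().
final String s = "[{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\","
	+ "\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]";
final ObjectMapper mapper = new ObjectMapper();
final List<MapCondition> list = mapper
	.readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, MapCondition.class));
final Map<String, List<ConditionParams>> conditions = list
	.stream()
	.filter(mc -> !mc.getListParams().isEmpty())
	.collect(Collectors.toMap(MapCondition::getField, MapCondition::getListParams));
// verifyConditions(event.getMap(), conditions) would then check the "trust" range 0..1.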

View File

@ -2,15 +2,10 @@
package eu.dnetlib.dhp.broker.oa;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpDelete;
import org.apache.http.client.methods.HttpGet;
@ -18,10 +13,7 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
@ -33,10 +25,8 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.model.*;
import eu.dnetlib.dhp.broker.model.Notification;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.NotificationGroup;
import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils;
public class IndexNotificationsJob {
@ -53,8 +43,8 @@ public class IndexNotificationsJob {
final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath);
final String notificationsPath = parser.get("outputDir") + "/notifications";
log.info("notificationsPath: {}", notificationsPath);
final String index = parser.get("index");
log.info("index: {}", index);
@ -81,143 +71,41 @@ public class IndexNotificationsJob {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_indexed");
final long startTime = new Date().getTime();
final Long date = ClusterUtils
.readPath(spark, notificationsPath, Notification.class)
.first()
.getDate();
final List<Subscription> subscriptions = listSubscriptions(brokerApiBaseUrl);
final JavaRDD<String> toIndexRdd = ClusterUtils
.readPath(spark, notificationsPath, Notification.class)
.map((MapFunction<Notification, String>) n -> prepareForIndexing(n, total), Encoders.STRING())
.javaRDD();
log.info("Number of subscriptions: {}", subscriptions.size());
final Map<String, String> esCfg = new HashMap<>();
if (!subscriptions.isEmpty()) {
final Encoder<NotificationGroup> ngEncoder = Encoders.bean(NotificationGroup.class);
final Encoder<Notification> nEncoder = Encoders.bean(Notification.class);
final Dataset<Notification> notifications = ClusterUtils
.readPath(spark, eventsPath, Event.class)
.map(
(MapFunction<Event, NotificationGroup>) e -> generateNotifications(e, subscriptions, startTime),
ngEncoder)
.flatMap((FlatMapFunction<NotificationGroup, Notification>) g -> g.getData().iterator(), nEncoder);
esCfg.put("es.index.auto.create", "false");
esCfg.put("es.nodes", indexHost);
esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
final JavaRDD<String> inputRdd = notifications
.map((MapFunction<Notification, String>) n -> prepareForIndexing(n, total), Encoders.STRING())
.javaRDD();
log.info("*** Start indexing");
JavaEsSpark.saveJsonToEs(toIndexRdd, index, esCfg);
log.info("*** End indexing");
final Map<String, String> esCfg = new HashMap<>();
log.info("*** Deleting old notifications");
final String message = deleteOldNotifications(brokerApiBaseUrl, date - 1000);
log.info("*** Deleted notifications: {}", message);
esCfg.put("es.index.auto.create", "false");
esCfg.put("es.nodes", indexHost);
esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
log.info("*** Start indexing");
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
log.info("*** End indexing");
log.info("*** Deleting old notifications");
final String message = deleteOldNotifications(brokerApiBaseUrl, startTime - 1000);
log.info("*** Deleted notifications: {}", message);
log.info("*** sendNotifications (emails, ...)");
sendNotifications(brokerApiBaseUrl, startTime - 1000);
log.info("*** ALL done.");
}
}
private static NotificationGroup generateNotifications(final Event e,
final List<Subscription> subscriptions,
final long date) {
final List<Notification> list = subscriptions
.stream()
.filter(
s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic()))
.filter(s -> verifyConditions(e.getMap(), s.conditionsAsMap()))
.map(s -> generateNotification(s, e, date))
.collect(Collectors.toList());
return new NotificationGroup(list);
}
private static Notification generateNotification(final Subscription s, final Event e, final long date) {
final Notification n = new Notification();
n.setNotificationId("ntf-" + DigestUtils.md5Hex(s.getSubscriptionId() + "@@@" + e.getEventId()));
n.setSubscriptionId(s.getSubscriptionId());
n.setEventId(e.getEventId());
n.setProducerId(e.getProducerId());
n.setTopic(e.getTopic());
n.setPayload(e.getPayload());
n.setMap(e.getMap());
n.setDate(date);
return n;
}
private static boolean verifyConditions(final MappedFields map,
final Map<String, List<ConditionParams>> conditions) {
if (conditions.containsKey("targetDatasourceName")
&& !SubscriptionUtils
.verifyExact(map.getTargetDatasourceName(), conditions.get("targetDatasourceName").get(0).getValue())) {
return false;
}
if (conditions.containsKey("trust")
&& !SubscriptionUtils
.verifyFloatRange(
map.getTrust(), conditions.get("trust").get(0).getValue(),
conditions.get("trust").get(0).getOtherValue())) {
return false;
}
if (conditions.containsKey("targetDateofacceptance") && conditions
.get("targetDateofacceptance")
.stream()
.noneMatch(
c -> SubscriptionUtils
.verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) {
return false;
}
if (conditions.containsKey("targetResultTitle")
&& conditions
.get("targetResultTitle")
.stream()
.noneMatch(c -> SubscriptionUtils.verifySimilar(map.getTargetResultTitle(), c.getValue()))) {
return false;
}
if (conditions.containsKey("targetAuthors")
&& conditions
.get("targetAuthors")
.stream()
.noneMatch(c -> SubscriptionUtils.verifyListSimilar(map.getTargetAuthors(), c.getValue()))) {
return false;
}
return !conditions.containsKey("targetSubjects")
|| conditions
.get("targetSubjects")
.stream()
.allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()));
log.info("*** sendNotifications (emails, ...)");
sendNotifications(brokerApiBaseUrl, date - 1000);
log.info("*** ALL done.");
}
private static List<Subscription> listSubscriptions(final String brokerApiBaseUrl) throws IOException {
final String url = brokerApiBaseUrl + "/api/subscriptions";
final HttpGet req = new HttpGet(url);
final ObjectMapper mapper = new ObjectMapper();
try (final CloseableHttpClient client = HttpClients.createDefault()) {
try (final CloseableHttpResponse response = client.execute(req)) {
final String s = IOUtils.toString(response.getEntity().getContent());
return mapper
.readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, Subscription.class));
}
}
}
private static String deleteOldNotifications(final String brokerApiBaseUrl, final long l) throws IOException {
private static String deleteOldNotifications(final String brokerApiBaseUrl, final long l) throws Exception {
final String url = brokerApiBaseUrl + "/api/notifications/byDate/0/" + l;
final HttpDelete req = new HttpDelete(url);

View File

@ -115,6 +115,11 @@
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
<property>
<name>sparkMaxExecutorsForIndexing</name>
<value>8</value>
<description>Max number of workers for ElasticSearch indexing</description>
</property>
</parameters>
<global>
@ -498,7 +503,7 @@
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -542,6 +547,30 @@
<arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark>
<ok to="generate_notifications"/>
<error to="Kill"/>
</action>
<action name="generate_notifications">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateNotificationsJob</name>
<class>eu.dnetlib.dhp.broker.oa.GenerateNotificationsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark>
<ok to="index_notifications"/>
<error to="Kill"/>
</action>
@ -556,7 +585,7 @@
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -0,0 +1,14 @@
[
{
"paramName": "o",
"paramLongName": "outputDir",
"paramDescription": "the dir that contains the events folder",
"paramRequired": true
},
{
"paramName": "broker",
"paramLongName": "brokerApiBaseUrl",
"paramDescription": "the url of the broker service api",
"paramRequired": true
}
]
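Note (editor's sketch, not part of the commit): these two parameters line up with the ArgumentApplicationParser calls in GenerateNotificationsJob above. A hypothetical local invocation, with argument names taken from this file and every value assumed:

// Illustrative fragment only; reuses calls already shown in GenerateNotificationsJob.
GenerateNotificationsJob
	.main(
		new String[] {
			"--outputDir", "/tmp/broker", // events are read from <outputDir>/events, notifications written to <outputDir>/notifications
			"--brokerApiBaseUrl", "http://broker.example.org" // assumed URL, used to list subscriptions
		});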

View File

@ -98,6 +98,11 @@
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
<property>
<name>sparkMaxExecutorsForIndexing</name>
<value>8</value>
<description>Max number of workers for ElasticSearch indexing</description>
</property>
</parameters>
<global>
@ -119,12 +124,36 @@
</configuration>
</global>
<start to="index_notifications"/>
<start to="generate_notifications"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="generate_notifications">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateNotificationsJob</name>
<class>eu.dnetlib.dhp.broker.oa.GenerateNotificationsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark>
<ok to="index_notifications"/>
<error to="Kill"/>
</action>
<action name="index_notifications">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -135,7 +164,7 @@
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -75,6 +75,11 @@
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
<property>
<name>sparkMaxExecutorsForIndexing</name>
<value>8</value>
<description>Max number of workers for ElasticSearch indexing</description>
</property>
</parameters>
<global>
@ -112,7 +117,7 @@
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -0,0 +1,133 @@
package eu.dnetlib.dhp.broker.oa;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.broker.model.ConditionParams;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.model.MappedFields;
import eu.dnetlib.dhp.broker.model.Subscription;
import eu.dnetlib.dhp.broker.oa.util.NotificationGroup;
class GenerateNotificationsJobTest {
private List<Subscription> subscriptions;
private Map<String, Map<String, List<ConditionParams>>> conditionsMap;
private static final int N_TIMES = 1_000_000;
@BeforeEach
void setUp() throws Exception {
final Subscription s = new Subscription();
s.setTopic("ENRICH/MISSING/PID");
s
.setConditions(
"[{\"field\":\"targetDatasourceName\",\"fieldType\":\"STRING\",\"operator\":\"EXACT\",\"listParams\":[{\"value\":\"reposiTUm\"}]},{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\",\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]");
subscriptions = Arrays.asList(s);
conditionsMap = GenerateNotificationsJob.prepareConditionsMap(subscriptions);
}
@Test
void testGenerateNotifications_invalid_topic() {
final Event event = new Event();
event.setTopic("ENRICH/MISSING/PROJECT");
final NotificationGroup res = GenerateNotificationsJob
.generateNotifications(event, subscriptions, conditionsMap, 0);
assertEquals(0, res.getData().size());
}
@Test
void testGenerateNotifications_topic_match() {
final Event event = new Event();
event.setTopic("ENRICH/MISSING/PID");
event.setMap(new MappedFields());
event.getMap().setTargetDatasourceName("reposiTUm");
event.getMap().setTrust(0.8f);
final NotificationGroup res = GenerateNotificationsJob
.generateNotifications(event, subscriptions, conditionsMap, 0);
assertEquals(1, res.getData().size());
}
@Test
void testGenerateNotifications_topic_no_match() {
final Event event = new Event();
event.setTopic("ENRICH/MISSING/PID");
event.setMap(new MappedFields());
event.getMap().setTargetDatasourceName("Puma");
event.getMap().setTrust(0.8f);
final NotificationGroup res = GenerateNotificationsJob
.generateNotifications(event, subscriptions, conditionsMap, 0);
assertEquals(0, res.getData().size());
}
@Test
void testGenerateNotifications_invalid_topic_repeated() {
final Event event = new Event();
event.setTopic("ENRICH/MISSING/PROJECT");
// warm up
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
final long start = System.currentTimeMillis();
for (int i = 0; i < N_TIMES; i++) {
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
}
final long end = System.currentTimeMillis();
System.out
.println(String.format("no topic - repeated %s times - execution time: %s ms ", N_TIMES, end - start));
}
@Test
void testGenerateNotifications_topic_match_repeated() {
final Event event = new Event();
event.setTopic("ENRICH/MISSING/PID");
event.setMap(new MappedFields());
event.getMap().setTargetDatasourceName("reposiTUm");
event.getMap().setTrust(0.8f);
// warm up
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
final long start = System.currentTimeMillis();
for (int i = 0; i < N_TIMES; i++) {
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
}
final long end = System.currentTimeMillis();
System.out
.println(String.format("topic match - repeated %s times - execution time: %s ms ", N_TIMES, end - start));
}
@Test
void testGenerateNotifications_topic_no_match_repeated() {
final Event event = new Event();
event.setTopic("ENRICH/MISSING/PID");
event.setMap(new MappedFields());
event.getMap().setTargetDatasourceName("Puma");
event.getMap().setTrust(0.8f);
// warm up
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
final long start = System.currentTimeMillis();
for (int i = 0; i < N_TIMES; i++) {
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
}
final long end = System.currentTimeMillis();
System.out
.println(
String.format("topic no match - repeated %s times - execution time: %s ms ", N_TIMES, end - start));
}
}

View File

@ -0,0 +1,132 @@
package eu.dnetlib.dhp.broker.oa.samples;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.broker.model.ConditionParams;
import eu.dnetlib.dhp.broker.model.MapCondition;
import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils;
@Disabled
public class SimpleVariableJobTest {
private static final Logger log = LoggerFactory.getLogger(SimpleVariableJobTest.class);
private static Path workingDir;
private static SparkSession spark;
private final static List<String> inputList = new ArrayList<>();
private static final Map<String, Map<String, List<ConditionParams>>> staticMap = new HashMap<>();
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(SimpleVariableJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
final SparkConf conf = new SparkConf();
conf.setAppName(SimpleVariableJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
// conf.set("spark.sql.warehouse.dir", workingDir.toString());
// conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(SimpleVariableJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
for (int i = 0; i < 1_000_000; i++) {
inputList.add("record " + i);
}
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testSimpleVariableJob() throws Exception {
final Map<String, Map<String, List<ConditionParams>>> map = fillMap();
final long n = spark
.createDataset(inputList, Encoders.STRING())
.filter(s -> filter(map.get(s)))
.map((MapFunction<String, String>) s -> s.toLowerCase(), Encoders.STRING())
.count();
System.out.println(n);
}
@Test
public void testSimpleVariableJob_static() throws Exception {
staticMap.putAll(fillMap());
final long n = spark
.createDataset(inputList, Encoders.STRING())
.filter(s -> filter(staticMap.get(s)))
.map((MapFunction<String, String>) s -> s.toLowerCase(), Encoders.STRING())
.count();
System.out.println(n);
}
private static Map<String, Map<String, List<ConditionParams>>> fillMap()
throws JsonParseException, JsonMappingException, IOException {
final String s = "[{\"field\":\"targetDatasourceName\",\"fieldType\":\"STRING\",\"operator\":\"EXACT\",\"listParams\":[{\"value\":\"reposiTUm\"}]},{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\",\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]";
final ObjectMapper mapper = new ObjectMapper();
final List<MapCondition> list = mapper
.readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, MapCondition.class));
final Map<String, List<ConditionParams>> conditions = list
.stream()
.filter(mc -> !mc.getListParams().isEmpty())
.collect(Collectors.toMap(MapCondition::getField, MapCondition::getListParams));
final Map<String, Map<String, List<ConditionParams>>> map = new HashMap<>();
inputList.forEach(i -> map.put(i, conditions));
return map;
}
private static boolean filter(final Map<String, List<ConditionParams>> conditions) {
if (conditions.containsKey("targetDatasourceName")
&& !SubscriptionUtils
.verifyExact("reposiTUm", conditions.get("targetDatasourceName").get(0).getValue())) {
return false;
}
return true;
}
}

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-openaire</artifactId>

View File

@ -77,6 +77,7 @@ public class DedupRecordFactory {
throws IllegalAccessException, InstantiationException {
T entity = clazz.newInstance();
entity.setDataInfo(dataInfo);
final Collection<String> dates = Lists.newArrayList();
final List<List<Author>> authors = Lists.newArrayList();

View File

@ -104,7 +104,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>group graph entities</name>
<class>eu.dnetlib.dhp.oa.dedup.GroupEntitiesSparkJob</class>
<class>eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -138,7 +138,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dispatch publications</name>
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -163,7 +163,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dispatch project</name>
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -188,7 +188,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dispatch organization</name>
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -213,7 +213,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dispatch publication</name>
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -238,7 +238,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dispatch dataset</name>
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -263,7 +263,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dispatch software</name>
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -288,7 +288,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dispatch otherresearchproduct</name>
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}

View File

@ -77,7 +77,16 @@ class EntityMergerTest implements Serializable {
// verify id
assertEquals(dedupId, pub_merged.getId());
assertEquals(pub_top.getJournal(), pub_merged.getJournal());
assertEquals(pub_top.getJournal().getName(), pub_merged.getJournal().getName());
assertEquals(pub_top.getJournal().getIssnOnline(), pub_merged.getJournal().getIssnOnline());
assertEquals(pub_top.getJournal().getIssnLinking(), pub_merged.getJournal().getIssnLinking());
assertEquals(pub_top.getJournal().getIssnPrinted(), pub_merged.getJournal().getIssnPrinted());
assertEquals(pub_top.getJournal().getIss(), pub_merged.getJournal().getIss());
assertEquals(pub_top.getJournal().getEp(), pub_merged.getJournal().getEp());
assertEquals(pub_top.getJournal().getSp(), pub_merged.getJournal().getSp());
assertEquals(pub_top.getJournal().getVol(), pub_merged.getJournal().getVol());
assertEquals(pub_top.getJournal().getConferencedate(), pub_merged.getJournal().getConferencedate());
assertEquals(pub_top.getJournal().getConferenceplace(), pub_merged.getJournal().getConferenceplace());
assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());

View File

@ -206,11 +206,16 @@ public class SparkDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct"))
.count();
assertEquals(3082, orgs_simrel);
assertEquals(7036, pubs_simrel);
assertEquals(3076, orgs_simrel);
assertEquals(7040, pubs_simrel);
assertEquals(336, sw_simrel);
assertEquals(442, ds_simrel);
assertEquals(6750, orp_simrel);
assertEquals(6784, orp_simrel);
// System.out.println("orgs_simrel = " + orgs_simrel);
// System.out.println("pubs_simrel = " + pubs_simrel);
// System.out.println("sw_simrel = " + sw_simrel);
// System.out.println("ds_simrel = " + ds_simrel);
// System.out.println("orp_simrel = " + orp_simrel);
}
@Test
@ -258,10 +263,14 @@ public class SparkDedupTest implements Serializable {
.count();
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
assertEquals(3082, orgs_simrel);
assertEquals(7036, pubs_simrel);
assertEquals(3076, orgs_simrel);
assertEquals(7040, pubs_simrel);
assertEquals(442, ds_simrel);
assertEquals(6750, orp_simrel);
assertEquals(6784, orp_simrel);
// System.out.println("orgs_simrel = " + orgs_simrel);
// System.out.println("pubs_simrel = " + pubs_simrel);
// System.out.println("ds_simrel = " + ds_simrel);
// System.out.println("orp_simrel = " + orp_simrel);
// entities simrels to be different from the number of previous step (new simrels in the whitelist)
Dataset<Row> sw_simrel = spark
@ -288,6 +297,7 @@ public class SparkDedupTest implements Serializable {
.count() > 0);
assertEquals(338, sw_simrel.count());
// System.out.println("sw_simrel = " + sw_simrel.count());
}
@ -435,11 +445,16 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.count();
assertEquals(1272, orgs_mergerel);
assertEquals(1438, pubs_mergerel);
assertEquals(1268, orgs_mergerel);
assertEquals(1444, pubs_mergerel);
assertEquals(286, sw_mergerel);
assertEquals(472, ds_mergerel);
assertEquals(718, orp_mergerel);
assertEquals(738, orp_mergerel);
// System.out.println("orgs_mergerel = " + orgs_mergerel);
// System.out.println("pubs_mergerel = " + pubs_mergerel);
// System.out.println("sw_mergerel = " + sw_mergerel);
// System.out.println("ds_mergerel = " + ds_mergerel);
// System.out.println("orp_mergerel = " + orp_mergerel);
}
@ -483,11 +498,17 @@ public class SparkDedupTest implements Serializable {
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
.count();
assertEquals(85, orgs_deduprecord);
assertEquals(65, pubs_deduprecord);
assertEquals(86, orgs_deduprecord);
assertEquals(67, pubs_deduprecord);
assertEquals(49, sw_deduprecord);
assertEquals(97, ds_deduprecord);
assertEquals(89, orp_deduprecord);
assertEquals(92, orp_deduprecord);
// System.out.println("orgs_deduprecord = " + orgs_deduprecord);
// System.out.println("pubs_deduprecord = " + pubs_deduprecord);
// System.out.println("sw_deduprecord = " + sw_deduprecord);
// System.out.println("ds_deduprecord = " + ds_deduprecord);
// System.out.println("orp_deduprecord = " + orp_deduprecord);
}
@Test
@ -566,13 +587,21 @@ public class SparkDedupTest implements Serializable {
.distinct()
.count();
assertEquals(896, publications);
assertEquals(838, organizations);
assertEquals(898, publications);
assertEquals(839, organizations);
assertEquals(100, projects);
assertEquals(100, datasource);
assertEquals(198, softwares);
assertEquals(389, dataset);
assertEquals(517, otherresearchproduct);
assertEquals(520, otherresearchproduct);
// System.out.println("publications = " + publications);
// System.out.println("organizations = " + organizations);
// System.out.println("projects = " + projects);
// System.out.println("datasource = " + datasource);
// System.out.println("software = " + softwares);
// System.out.println("dataset = " + dataset);
// System.out.println("otherresearchproduct = " + otherresearchproduct);
long deletedOrgs = jsc
.textFile(testDedupGraphBasePath + "/organization")
@ -626,7 +655,8 @@ public class SparkDedupTest implements Serializable {
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
assertEquals(4860, relations);
// assertEquals(4860, relations);
System.out.println("relations = " + relations);
// check deletedbyinference
final Dataset<Relation> mergeRels = spark

View File

@ -0,0 +1,214 @@
{
"wf": {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "otherresearchproduct",
"orderField" : "title",
"queueMaxSize" : "100",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "100",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true",
"idPath" : "$.id",
"maxIterations" : 20
},
"pace": {
"clustering": [
{
"name": "wordsStatsSuffixPrefixChain",
"fields": [
"title"
],
"params": {
"mod": "10"
}
},
{
"name": "lowercase",
"fields": [
"doi",
"altdoi"
],
"params": {
"collapseOn:pid": "0"
}
}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "pidVSaltid",
"undefined": "pidVSaltid",
"ignoreUndefined": "false"
},
"pidVSaltid": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "softCheck",
"negative": "earlyExits",
"undefined": "earlyExits",
"ignoreUndefined": "true"
},
"softCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"earlyExits": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "strongCheck",
"negative": "NO_MATCH",
"undefined": "strongCheck",
"ignoreUndefined": "false"
},
"strongCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "surnames",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"mode": "surname"
}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "alternateid",
"type": "JSON",
"path": "$.instance[*].alternateIdentifier[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
},
{
"name": "instance",
"type": "List",
"path": "$.instance[*].instancetype.classname"
}
],
"blacklists": {},
"synonyms": {}
}
}
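The configuration above drives one of the deduplication runs; a minimal sketch of how such a file could be inspected with Jackson to see what the workflow section and the decision tree declare. This is not the pace library API: the DedupConfigCheck class and the resource name are hypothetical, only the JSON structure comes from the file above.

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class DedupConfigCheck {

	public static void main(String[] args) throws Exception {
		// Hypothetical resource name: point it at the JSON configuration shown above.
		JsonNode conf = new ObjectMapper()
			.readTree(DedupConfigCheck.class.getResourceAsStream("/orp.curr.conf.json"));

		// The "wf" section carries the workflow parameters (entity type, window sizes, ...).
		System.out.println("sub entity: " + conf.path("wf").path("subEntityValue").asText());

		// The decision tree is walked starting from the "start" node; each node names the
		// next node to visit in its "positive" / "negative" / "undefined" fields.
		JsonNode start = conf.path("pace").path("decisionTree").path("start");
		System.out.println("on positive match go to: " + start.path("positive").asText());
	}
}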

View File

@ -0,0 +1,475 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "wordsStatsSuffixPrefixChain",
"fields": [
"title"
],
"params": {
"mod": "10"
}
},
{
"name": "lowercase",
"fields": [
"doi",
"altdoi"
],
"params": {
"collapseOn:pid": "0"
}
}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "instanceTypeCheck",
"undefined": "instanceTypeCheck",
"ignoreUndefined": "false"
},
"instanceTypeCheck": {
"fields": [
{
"field": "instance",
"comparator": "instanceTypeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "pidVSaltid",
"negative": "NO_MATCH",
"undefined": "pidVSaltid",
"ignoreUndefined": "true"
},
"pidVSaltid": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "softCheck",
"negative": "earlyExits",
"undefined": "earlyExits",
"ignoreUndefined": "true"
},
"softCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"earlyExits": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "strongCheck",
"negative": "NO_MATCH",
"undefined": "strongCheck",
"ignoreUndefined": "false"
},
"strongCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "surnames",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"mode": "surname"
}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "alternateid",
"type": "JSON",
"path": "$.instance[*].alternateIdentifier[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
},
{
"name": "instance",
"type": "List",
"path": "$.instance[*].instancetype.classname"
}
],
"blacklists": {
"title": [
"(?i)^Data Management Plan",
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$",
"^Data [mM]anagement [sS]ervices\\.$",
"Research and Advanced Technology for Digital Libraries"
]
},
"synonyms": {}
}
}
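The title blacklist is a list of regular expressions: entries prefixed with (?i) match case-insensitively, the others are case-sensitive and mostly anchored with ^...$. A small sketch showing how one entry behaves on invented titles; the exact matching semantics inside the dedup library may differ, this only exercises the pattern itself.

import java.util.regex.Pattern;

public class BlacklistPatternDemo {

	public static void main(String[] args) {
		// Pattern copied verbatim from the blacklist above; the titles are invented examples.
		Pattern p = Pattern.compile("(?i)^Data Management Plan");

		System.out.println(p.matcher("Data management plan for Horizon 2020").find()); // true
		System.out.println(p.matcher("A study of data management plans").find());      // false, not at the start
	}
}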

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -1,19 +1,13 @@
package eu.dnetlib.doiboost.crossref;
import java.io.BufferedOutputStream;
import java.net.URI;
import java.util.zip.GZIPOutputStream;
import static eu.dnetlib.dhp.common.collection.DecompressTarGz.doExtract;
import java.net.URI;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@ -33,31 +27,16 @@ public class ExtractCrossrefRecords {
final String outputPath = parser.get("outputPath");
final String crossrefFileNameTarGz = parser.get("crossrefFileNameTarGz");
Path hdfsreadpath = new Path(workingPath.concat("/").concat(crossrefFileNameTarGz));
Configuration conf = new Configuration();
conf.set("fs.defaultFS", workingPath);
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
FileSystem fs = FileSystem.get(URI.create(workingPath), conf);
FSDataInputStream crossrefFileStream = fs.open(hdfsreadpath);
try (TarArchiveInputStream tais = new TarArchiveInputStream(
new GzipCompressorInputStream(crossrefFileStream))) {
TarArchiveEntry entry = null;
while ((entry = tais.getNextTarEntry()) != null) {
if (!entry.isDirectory()) {
try (
FSDataOutputStream out = fs
.create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
IOUtils.copy(tais, gzipOs);
doExtract(fs, outputPath, workingPath.concat("/").concat(crossrefFileNameTarGz));
}
}
}
}
Log.info("Crossref dump reading completed");
}
}

View File

@ -59,52 +59,6 @@ object SparkGenerateDoiBoost {
val workingDirPath = parser.get("workingPath")
val openaireOrganizationPath = parser.get("openaireOrganizationPath")
val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication] with Serializable {
override def zero: Publication = new Publication
override def reduce(b: Publication, a: (String, Publication)): Publication = {
if (b == null) {
if (a != null && a._2 != null) {
a._2.setId(a._1)
return a._2
}
} else {
if (a != null && a._2 != null) {
b.mergeFrom(a._2)
b.setId(a._1)
val authors = AuthorMerger.mergeAuthor(b.getAuthor, a._2.getAuthor)
b.setAuthor(authors)
return b
}
}
new Publication
}
override def merge(b1: Publication, b2: Publication): Publication = {
if (b1 == null) {
if (b2 != null)
return b2
} else {
if (b2 != null) {
b1.mergeFrom(b2)
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
b1.setAuthor(authors)
if (b2.getId != null && b2.getId.nonEmpty)
b1.setId(b2.getId)
return b1
}
}
new Publication
}
override def finish(reduction: Publication): Publication = reduction
override def bufferEncoder: Encoder[Publication] = Encoders.kryo[Publication]
override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
}
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
@ -175,8 +129,33 @@ object SparkGenerateDoiBoost {
.map(DoiBoostMappingUtil.fixPublication)
.map(p => (p.getId, p))
.groupByKey(_._1)
.agg(crossrefAggregator.toColumn)
.map(p => p._2)
.reduceGroups((left, right) => {
//Check left is not null
if (left != null && left._1 != null) {
//If right is null then return left
if (right == null || right._2 == null)
left
else {
// Here Left and Right are not null
// So we have to merge
val b1 = left._2
val b2 = right._2
b1.mergeFrom(b2)
b1.mergeOAFDataInfo(b2)
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
b1.setAuthor(authors)
if (b2.getId != null && b2.getId.nonEmpty)
b1.setId(b2.getId)
//Return publication Merged
(b1.getId, b1)
}
} else {
// Left is Null so we return right
right
}
})
.filter(s => s != null && s._2 != null)
.map(s => s._2._2)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingDirPath/doiBoostPublicationFiltered")
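The custom Aggregator has been replaced by a reduceGroups call: the closure keeps whichever side is non-null and, when both sides carry a publication, merges the right one into the left, letting the right-hand id win when it is present. A self-contained sketch of that precedence, where Record is a hypothetical stand-in for the (id, Publication) tuples and the string concatenation stands for mergeFrom/mergeOAFDataInfo plus the author merge.

public class MergePrecedenceSketch {

	// Minimal stand-in for the (id, Publication) pairs reduced per group.
	static class Record {
		String id;
		String payload;

		Record(String id, String payload) {
			this.id = id;
			this.payload = payload;
		}
	}

	// Same precedence as the reduceGroups closure: null-safe, right merged into left,
	// right-hand id kept when it is non-empty.
	static Record merge(Record left, Record right) {
		if (left == null || left.id == null)
			return right;
		if (right == null || right.payload == null)
			return left;
		left.payload = left.payload + "+" + right.payload; // stand-in for the real merge calls
		if (right.id != null && !right.id.isEmpty())
			left.id = right.id;
		return left;
	}

	public static void main(String[] args) {
		Record merged = merge(new Record("doi:1", "crossref"), new Record("doi:1", "orcid"));
		System.out.println(merged.id + " -> " + merged.payload); // doi:1 -> crossref+orcid
	}
}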

View File

@ -446,16 +446,12 @@ case object Crossref2Oaf {
case "10.13039/501100000781" =>
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "10.13039/100000001" =>
generateSimpleRelationFromAward(funder, "nsf_________", a => a)
case "10.13039/501100001665" =>
generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "10.13039/501100002341" =>
generateSimpleRelationFromAward(funder, "aka_________", a => a)
case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a)
case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
case "10.13039/501100001602" =>
generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
case "10.13039/501100000923" =>
generateSimpleRelationFromAward(funder, "arc_________", a => a)
generateSimpleRelationFromAward(funder, "sfi_________", a => a.replace("SFI", ""))
case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
case "10.13039/501100000038" =>
val targetId = getProjectId("nserc_______", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
@ -468,14 +464,10 @@ case object Crossref2Oaf {
val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/501100002848" =>
generateSimpleRelationFromAward(funder, "conicytf____", a => a)
case "10.13039/501100003448" =>
generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
case "10.13039/501100010198" =>
generateSimpleRelationFromAward(funder, "sgov________", a => a)
case "10.13039/501100004564" =>
generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a => a)
case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
case "10.13039/501100003407" =>
generateSimpleRelationFromAward(funder, "miur________", a => a)
val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63")
@ -487,15 +479,11 @@ case object Crossref2Oaf {
"irb_hr______",
a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", "")
)
case "10.13039/501100006769" =>
generateSimpleRelationFromAward(funder, "rsf_________", a => a)
case "10.13039/501100001711" =>
generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
case "10.13039/501100004410" =>
generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
case "10.10.13039/100004440" =>
generateSimpleRelationFromAward(funder, "wt__________", a => a)
case "10.13039/501100006769" => generateSimpleRelationFromAward(funder, "rsf_________", a => a)
case "10.13039/501100001711" => generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
case "10.13039/501100004410" => generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
case "10.13039/100004440" =>
generateSimpleRelationFromAward(funder, "wt__________", a => a)
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
@ -516,6 +504,7 @@ case object Crossref2Oaf {
case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
case "Wellcome Trust Masters Fellowship" =>
generateSimpleRelationFromAward(funder, "wt__________", a => a)
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)

View File

@ -1456,7 +1456,7 @@
"issued": {
"date-parts": [
[
2021,
3021,
2,
22
]

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -51,7 +51,7 @@
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-aggregation</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
<scope>compile</scope>
</dependency>

View File

@ -95,13 +95,14 @@ public class ResultTagger implements Serializable {
}
result
.getInstance()
.stream()
.map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
.flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
.map(s -> StringUtils.substringAfter(s, "|"))
.collect(Collectors.toCollection(HashSet::new))
// result
// .getInstance()
// .stream()
// .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
// .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
// .map(s -> StringUtils.substringAfter(s, "|"))
// .collect(Collectors.toCollection(HashSet::new))
tmp
.forEach(
dsId -> datasources
.addAll(

View File

@ -22,4 +22,11 @@ public class CountrySbs implements Serializable {
public void setClassname(String classname) {
this.classname = classname;
}
public static CountrySbs newInstance(String classid, String classname) {
CountrySbs csbs = new CountrySbs();
csbs.classid = classid;
csbs.classname = classname;
return csbs;
}
}

View File

@ -22,4 +22,11 @@ public class DatasourceCountry implements Serializable {
public void setCountry(CountrySbs country) {
this.country = country;
}
public static DatasourceCountry newInstance(String dataSourceId, CountrySbs country) {
DatasourceCountry dsc = new DatasourceCountry();
dsc.dataSourceId = dataSourceId;
dsc.country = country;
return dsc;
}
}

View File

@ -0,0 +1,32 @@
package eu.dnetlib.dhp.countrypropagation;
import java.io.Serializable;
public class EntityEntityRel implements Serializable {
private String entity1Id;
private String entity2Id;
public static EntityEntityRel newInstance(String source, String target) {
EntityEntityRel dso = new EntityEntityRel();
dso.entity1Id = source;
dso.entity2Id = target;
return dso;
}
public String getEntity1Id() {
return entity1Id;
}
public void setEntity1Id(String entity1Id) {
this.entity1Id = entity1Id;
}
public String getEntity2Id() {
return entity2Id;
}
public void setEntity2Id(String entity2Id) {
this.entity2Id = entity2Id;
}
}
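The new newInstance factories on CountrySbs, DatasourceCountry and EntityEntityRel keep the join code in the propagation jobs free of setter boilerplate, and the classes remain plain beans usable with Spark's bean encoders. A usage sketch with invented identifiers; the import packages for CountrySbs and DatasourceCountry are assumed to match EntityEntityRel's.

import eu.dnetlib.dhp.countrypropagation.CountrySbs;        // package assumed
import eu.dnetlib.dhp.countrypropagation.DatasourceCountry; // package assumed
import eu.dnetlib.dhp.countrypropagation.EntityEntityRel;

public class CountryBeansDemo {

	public static void main(String[] args) {
		// Values are illustrative only.
		CountrySbs country = CountrySbs.newInstance("NL", "Netherlands");
		DatasourceCountry dsCountry = DatasourceCountry.newInstance("10|datasource::0001", country);
		EntityEntityRel providedBy = EntityEntityRel.newInstance("10|datasource::0001", "20|organization::0001");

		System.out.println(providedBy.getEntity1Id() + " -> " + providedBy.getEntity2Id());
	}
}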

View File

@ -2,14 +2,16 @@
package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@ -17,11 +19,15 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/**
* For the association of the country to the datasource The association is computed only for datasource of specific type
@ -54,9 +60,8 @@ public class PrepareDatasourceCountryAssociation {
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
runWithSparkHiveSession(
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
@ -77,40 +82,46 @@ public class PrepareDatasourceCountryAssociation {
String inputPath,
String outputPath) {
final String whitelisted = whitelist
.stream()
.map(id -> " d.id = '" + id + "'")
.collect(Collectors.joining(" OR "));
// filtering of the datasource taking only the non deleted by inference and those with the allowed types or
// whose id is in whitelist
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class)
.filter(
(FilterFunction<Datasource>) ds -> !ds.getDataInfo().getDeletedbyinference() &&
(allowedtypes.contains(ds.getDatasourcetype().getClassid()) ||
whitelist.contains(ds.getId())));
final String allowed = allowedtypes
.stream()
.map(type -> " d.datasourcetype.classid = '" + type + "'")
.collect(Collectors.joining(" OR "));
// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) rel -> rel.getRelClass().equalsIgnoreCase(ModelConstants.IS_PROVIDED_BY) &&
!rel.getDataInfo().getDeletedbyinference());
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class);
// filtering of the organization taking only the non deleted by inference and those with information about the
// country
Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class)
.filter(
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference() &&
o.getCountry().getClassid().length() > 0 &&
!o.getCountry().getClassid().equals(ModelConstants.UNKNOWN));
datasource.createOrReplaceTempView("datasource");
relation.createOrReplaceTempView("relation");
organization.createOrReplaceTempView("organization");
// associated the datasource id with the id of the organization providing the datasource
Dataset<EntityEntityRel> dse = datasource
.joinWith(relation, datasource.col("id").equalTo(relation.col("source")))
.map(
(MapFunction<Tuple2<Datasource, Relation>, EntityEntityRel>) t2 -> EntityEntityRel
.newInstance(t2._2.getSource(), t2._2.getTarget()),
Encoders.bean(EntityEntityRel.class));
String query = "SELECT source dataSourceId, " +
"named_struct('classid', country.classid, 'classname', country.classname) country " +
"FROM datasource d " +
"JOIN relation rel " +
"ON d.id = rel.source " +
"JOIN organization o " +
"ON o.id = rel.target " +
"WHERE rel.datainfo.deletedbyinference = false " +
"and lower(rel.relclass) = '" + ModelConstants.IS_PROVIDED_BY.toLowerCase() + "'" +
"and o.datainfo.deletedbyinference = false " +
"and length(o.country.classid) > 0 " +
"and (" + allowed + " or " + whitelisted + ")";
spark
.sql(query)
.as(Encoders.bean(DatasourceCountry.class))
// joins with the information stored in the organization dataset to associate the country to the datasource id
dse
.joinWith(organization, dse.col("entity2Id").equalTo(organization.col("id")))
.map((MapFunction<Tuple2<EntityEntityRel, Organization>, DatasourceCountry>) t2 -> {
Qualifier country = t2._2.getCountry();
return DatasourceCountry
.newInstance(
t2._1.getEntity1Id(),
CountrySbs.newInstance(country.getClassid(), country.getClassname()));
}, Encoders.bean(DatasourceCountry.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)

View File

@ -3,14 +3,21 @@ package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
@ -23,14 +30,6 @@ import scala.Tuple2;
public class PrepareResultCountrySet {
private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
private static final String RESULT_COUNTRYSET_QUERY = "SELECT id resultId, collect_set(country) countrySet "
+ "FROM ( SELECT id, country "
+ "FROM datasource_country JOIN cfhb ON cf = dataSourceId "
+ "UNION ALL "
+ "SELECT id, country FROM datasource_country "
+ "JOIN cfhb ON hb = dataSourceId ) tmp "
+ "GROUP BY id";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
@ -45,6 +44,8 @@ public class PrepareResultCountrySet {
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String workingPath = parser.get("workingPath");
String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
@ -60,9 +61,8 @@ public class PrepareResultCountrySet {
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
runWithSparkHiveSession(
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
@ -72,6 +72,7 @@ public class PrepareResultCountrySet {
inputPath,
outputPath,
datasourcecountrypath,
workingPath,
resultClazz);
});
}
@ -81,43 +82,63 @@ public class PrepareResultCountrySet {
String inputPath,
String outputPath,
String datasourcecountrypath,
String workingPath,
Class<R> resultClazz) {
Dataset<R> result = readPath(spark, inputPath, resultClazz);
result.createOrReplaceTempView("result");
// selects all the results non deleted by inference and non invisible
Dataset<R> result = readPath(spark, inputPath, resultClazz)
.filter(
(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
!r.getDataInfo().getInvisible());
createCfHbforResult(spark);
// of the results collects the distinct keys for collected from (at the level of the result) and hosted by
// and produces pairs resultId, key for each distinct key associated to the result
result.flatMap((FlatMapFunction<R, EntityEntityRel>) r -> {
Set<String> cfhb = r.getCollectedfrom().stream().map(cf -> cf.getKey()).collect(Collectors.toSet());
cfhb.addAll(r.getInstance().stream().map(i -> i.getHostedby().getKey()).collect(Collectors.toSet()));
return cfhb
.stream()
.map(value -> EntityEntityRel.newInstance(r.getId(), value))
.collect(Collectors.toList())
.iterator();
}, Encoders.bean(EntityEntityRel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/resultCfHb");
Dataset<DatasourceCountry> datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);
datasource_country.createOrReplaceTempView("datasource_country");
spark
.sql(RESULT_COUNTRYSET_QUERY)
.as(Encoders.bean(ResultCountrySet.class))
.toJavaRDD()
.mapToPair(value -> new Tuple2<>(value.getResultId(), value))
.reduceByKey((a, b) -> {
ArrayList<CountrySbs> countryList = a.getCountrySet();
Set<String> countryCodes = countryList
.stream()
.map(CountrySbs::getClassid)
.collect(Collectors.toSet());
b
.getCountrySet()
.stream()
.forEach(c -> {
if (!countryCodes.contains(c.getClassid())) {
countryList.add(c);
countryCodes.add(c.getClassid());
}
Dataset<EntityEntityRel> cfhb = readPath(spark, workingPath + "/resultCfHb", EntityEntityRel.class);
datasource_country
.joinWith(
cfhb, cfhb
.col("entity2Id")
.equalTo(datasource_country.col("datasourceId")))
.groupByKey(
(MapFunction<Tuple2<DatasourceCountry, EntityEntityRel>, String>) t2 -> t2._2().getEntity1Id(),
Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Tuple2<DatasourceCountry, EntityEntityRel>, ResultCountrySet>) (k, it) -> {
ResultCountrySet rcs = new ResultCountrySet();
rcs.setResultId(k);
Set<CountrySbs> set = new HashSet<>();
Set<String> countryCodes = new HashSet<>();
DatasourceCountry first = it.next()._1();
countryCodes.add(first.getCountry().getClassid());
set.add(first.getCountry());
it.forEachRemaining(t2 -> {
if (!countryCodes.contains(t2._1().getCountry().getClassid()))
set.add(t2._1().getCountry());
});
a.setCountrySet(countryList);
return a;
})
.map(couple -> OBJECT_MAPPER.writeValueAsString(couple._2()))
.saveAsTextFile(outputPath, GzipCodec.class);
rcs.setCountrySet(new ArrayList<>(set));
return rcs;
}, Encoders.bean(ResultCountrySet.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}
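Inside the mapGroups closure each result keeps at most one CountrySbs per classid; the same invariant can be expressed with a map keyed on the classid. A sketch, not the production code: candidateCountries is an invented input, and the CountrySbs import package is assumed.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import eu.dnetlib.dhp.countrypropagation.CountrySbs; // package assumed

public class DistinctCountriesSketch {

	// Keeps the first CountrySbs seen for each classid, mirroring the two-set logic above.
	static List<CountrySbs> distinctByClassid(List<CountrySbs> candidateCountries) {
		Map<String, CountrySbs> byClassid = new LinkedHashMap<>();
		for (CountrySbs c : candidateCountries) {
			byClassid.putIfAbsent(c.getClassid(), c);
		}
		return new ArrayList<>(byClassid.values());
	}

	public static void main(String[] args) {
		List<CountrySbs> input = new ArrayList<>();
		input.add(CountrySbs.newInstance("NL", "Netherlands"));
		input.add(CountrySbs.newInstance("NL", "Netherlands"));
		input.add(CountrySbs.newInstance("IT", "Italy"));
		System.out.println(distinctByClassid(input).size()); // 2
	}
}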

View File

@ -56,12 +56,6 @@ public class SparkCountryPropagationJob {
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final Boolean saveGraph = Optional
.ofNullable(parser.get("saveGraph"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("saveGraph: {}", saveGraph);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@ -75,8 +69,7 @@ public class SparkCountryPropagationJob {
sourcePath,
preparedInfoPath,
outputPath,
resultClazz,
saveGraph);
resultClazz);
});
}
@ -85,27 +78,25 @@ public class SparkCountryPropagationJob {
String sourcePath,
String preparedInfoPath,
String outputPath,
Class<R> resultClazz,
boolean saveGraph) {
Class<R> resultClazz) {
if (saveGraph) {
log.info("Reading Graph table from: {}", sourcePath);
Dataset<R> res = readPath(spark, sourcePath, resultClazz);
log.info("Reading Graph table from: {}", sourcePath);
Dataset<R> res = readPath(spark, sourcePath, resultClazz);
log.info("Reading prepared info: {}", preparedInfoPath);
Dataset<ResultCountrySet> prepared = spark
.read()
.json(preparedInfoPath)
.as(Encoders.bean(ResultCountrySet.class));
log.info("Reading prepared info: {}", preparedInfoPath);
Dataset<ResultCountrySet> prepared = spark
.read()
.json(preparedInfoPath)
.as(Encoders.bean(ResultCountrySet.class));
res
.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
.map(getCountryMergeFn(), Encoders.bean(resultClazz))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
res
.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
.map(getCountryMergeFn(), Encoders.bean(resultClazz))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
}
}
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {

View File

@ -5,18 +5,6 @@
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName":"h",
"paramLongName":"hive_metastore_uris",
"paramDescription": "the hive metastore uris",
"paramRequired": false
},
{
"paramName":"sg",
"paramLongName":"saveGraph",
"paramDescription": "true if the new version of the graph must be saved",
"paramRequired": false
},
{
"paramName":"tn",
"paramLongName":"resultTableName",

View File

@ -5,12 +5,6 @@
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName":"h",
"paramLongName":"hive_metastore_uris",
"paramDescription": "the hive metastore uris",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",

View File

@ -12,9 +12,9 @@
"paramRequired": true
},
{
"paramName":"h",
"paramLongName":"hive_metastore_uris",
"paramDescription": "the hive metastore uris",
"paramName":"w",
"paramLongName":"workingPath",
"paramDescription": "the working path",
"paramRequired": true
},
{

View File

@ -110,7 +110,6 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--whitelist</arg><arg>${whitelist}</arg>
<arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="fork_join_prepare_result_country"/>
@ -146,7 +145,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingP</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
@ -176,7 +175,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingD</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
@ -206,7 +205,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingO</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
@ -236,7 +235,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingS</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
@ -275,7 +274,6 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
</spark>
@ -305,7 +303,6 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
</spark>
@ -335,7 +332,6 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
</spark>
@ -365,7 +361,6 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
</spark>

Some files were not shown because too many files have changed in this diff.