merged from beta

commit 888f2de196
@@ -0,0 +1,21 @@
style = defaultWithAlign

align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true
@@ -6,7 +6,7 @@
<parent>
    <groupId>eu.dnetlib.dhp</groupId>
    <artifactId>dhp-build</artifactId>
-   <version>1.2.4-SNAPSHOT</version>
+   <version>1.2.5-SNAPSHOT</version>
</parent>

<artifactId>dhp-build-assembly-resources</artifactId>

@@ -6,7 +6,7 @@
<parent>
    <groupId>eu.dnetlib.dhp</groupId>
    <artifactId>dhp-build</artifactId>
-   <version>1.2.4-SNAPSHOT</version>
+   <version>1.2.5-SNAPSHOT</version>
</parent>

<artifactId>dhp-build-properties-maven-plugin</artifactId>
@@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
-<version>1.2.4-SNAPSHOT</version>
+<version>1.2.5-SNAPSHOT</version>

<packaging>jar</packaging>

@@ -47,12 +47,16 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.9.1</version>
<configuration>
    <skip>true</skip>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>

<properties>

    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties>
@@ -4,7 +4,7 @@
<parent>
    <groupId>eu.dnetlib.dhp</groupId>
    <artifactId>dhp</artifactId>
-   <version>1.2.4-SNAPSHOT</version>
+   <version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>

@@ -5,7 +5,7 @@
<parent>
    <groupId>eu.dnetlib.dhp</groupId>
    <artifactId>dhp</artifactId>
-   <version>1.2.4-SNAPSHOT</version>
+   <version>1.2.5-SNAPSHOT</version>
    <relativePath>../pom.xml</relativePath>
</parent>
@@ -0,0 +1,40 @@

package eu.dnetlib.dhp.common.collection;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DecompressTarGz {

    public static void doExtract(FileSystem fs, String outputPath, String tarGzPath) throws IOException {

        FSDataInputStream inputFileStream = fs.open(new Path(tarGzPath));
        try (TarArchiveInputStream tais = new TarArchiveInputStream(
            new GzipCompressorInputStream(inputFileStream))) {
            TarArchiveEntry entry = null;
            while ((entry = tais.getNextTarEntry()) != null) {
                if (!entry.isDirectory()) {
                    try (
                        FSDataOutputStream out = fs
                            .create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
                        GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {

                        IOUtils.copy(tais, gzipOs);

                    }
                }
            }
        }
    }
}
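The new DecompressTarGz helper above re-packages every entry of a .tar.gz archive as a standalone .gz file under the given output prefix. A minimal caller sketch, assuming an HDFS filesystem; the URI and paths below are made-up examples, not part of this commit:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import eu.dnetlib.dhp.common.collection.DecompressTarGz;

public class DecompressTarGzExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://nameservice1"); // hypothetical name node
        try (FileSystem fs = FileSystem.get(conf)) {
            // Each non-directory entry <name> is written to <outputPath><name>.gz
            DecompressTarGz.doExtract(fs, "/tmp/opencitations/extracted/", "/tmp/opencitations/dump.tar.gz");
        }
    }
}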
@@ -1,5 +1,5 @@

-package eu.dnetlib.dhp.oa.dedup;
+package eu.dnetlib.dhp.oa.merge;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

@@ -38,7 +38,7 @@ public class DispatchEntitiesSparkJob {
    .requireNonNull(
        DispatchEntitiesSparkJob.class
            .getResourceAsStream(
-               "/eu/dnetlib/dhp/oa/dedup/dispatch_entities_parameters.json")));
+               "/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json")));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

@@ -1,5 +1,5 @@

-package eu.dnetlib.dhp.oa.dedup;
+package eu.dnetlib.dhp.oa.merge;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;

@@ -53,7 +53,7 @@ public class GroupEntitiesSparkJob {
    .toString(
        GroupEntitiesSparkJob.class
            .getResourceAsStream(
-               "/eu/dnetlib/dhp/oa/dedup/group_graph_entities_parameters.json"));
+               "/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
@@ -47,6 +47,17 @@ public class OafMapperUtils {
    }

    public static Result mergeResults(Result left, Result right) {

+       final boolean leftFromDelegatedAuthority = isFromDelegatedAuthority(left);
+       final boolean rightFromDelegatedAuthority = isFromDelegatedAuthority(right);
+
+       if (leftFromDelegatedAuthority && !rightFromDelegatedAuthority) {
+           return left;
+       }
+       if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
+           return right;
+       }
+
        if (new ResultTypeComparator().compare(left, right) < 0) {
            left.mergeFrom(right);
            return left;

@@ -56,6 +67,18 @@ public class OafMapperUtils {
        }
    }

+   private static boolean isFromDelegatedAuthority(Result r) {
+       return Optional
+           .ofNullable(r.getInstance())
+           .map(
+               instance -> instance
+                   .stream()
+                   .filter(i -> Objects.nonNull(i.getCollectedfrom()))
+                   .map(i -> i.getCollectedfrom().getKey())
+                   .anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId)))
+           .orElse(false);
+   }
+
    public static KeyValue keyValue(final String k, final String v) {
        final KeyValue kv = new KeyValue();
        kv.setKey(k);

@@ -368,4 +391,19 @@ public class OafMapperUtils {
        }
        return null;
    }

+   public static KeyValue newKeyValueInstance(String key, String value, DataInfo dataInfo) {
+       KeyValue kv = new KeyValue();
+       kv.setDataInfo(dataInfo);
+       kv.setKey(key);
+       kv.setValue(value);
+       return kv;
+   }
+
+   public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
+       Measure m = new Measure();
+       m.setId(id);
+       m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo)));
+       return m;
+   }
}
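A short sketch of how the two new factory methods above compose; the values are illustrative only, and a real DataInfo would normally be built with OafMapperUtils.dataInfo(...):

import java.util.Arrays;
import java.util.List;

import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

public class MeasureFactorySketch {
    public static void main(String[] args) {
        DataInfo dataInfo = null; // placeholder; see OafMapperUtils.dataInfo(...)
        // One Measure with id "downloads" whose single unit is the KeyValue ("count", "10").
        Measure downloads = OafMapperUtils.newMeasureInstance("downloads", "10", "count", dataInfo);
        List<Measure> measures = Arrays.asList(downloads);
        System.out.println(measures.size()); // 1
    }
}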
@@ -185,6 +185,22 @@ class OafMapperUtilsTest {
        .getClassid());
    }

+   @Test
+   void testDelegatedAuthority() throws IOException {
+       Dataset d1 = read("dataset_2.json", Dataset.class);
+       Dataset d2 = read("dataset_delegated.json", Dataset.class);
+
+       assertEquals(1, d2.getCollectedfrom().size());
+       assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
+
+       Result res = OafMapperUtils.mergeResults(d1, d2);
+
+       assertEquals(d2, res);
+
+       System.out.println(OBJECT_MAPPER.writeValueAsString(res));
+
+   }
+
    protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
        return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
    }
@@ -1 +1,140 @@
-{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}
{
  "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
  "resuttype": {"classid": "dataset"},
  "pid": [
    {
      "qualifier": {"classid": "doi"},
      "value": "10.1016/j.cmet.2011.03.013"
    },
    {
      "qualifier": {"classid": "urn"},
      "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
    },
    {
      "qualifier": {"classid": "scp-number"},
      "value": "79953761260"
    },
    {
      "qualifier": {"classid": "pmc"},
      "value": "21459329"
    }
  ],
  "collectedfrom": [
    {
      "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
      "value": "Repository B"
    }
  ],
  "instance": [
    {
      "refereed": {
        "classid": "0000",
        "classname": "UNKNOWN",
        "schemeid": "dnet:review_levels",
        "schemename": "dnet:review_levels"
      },
      "hostedby": {
        "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
        "value": "Zenodo"
      },
      "accessright": {
        "classid": "OPEN",
        "classname": "Open Access",
        "schemeid": "dnet:access_modes",
        "schemename": "dnet:access_modes"
      },
      "processingchargecurrency": {
        "dataInfo": {
          "provenanceaction": {
            "classid": "sysimport:crosswalk:datasetarchive",
            "classname": "Harvested",
            "schemeid": "dnet:provenanceActions",
            "schemename": "dnet:provenanceActions"
          },
          "deletedbyinference": false,
          "inferred": false,
          "inferenceprovenance": "",
          "invisible": true,
          "trust": "0.9"
        },
        "value": "EUR"
      },
      "pid": [
        {
          "dataInfo": {
            "provenanceaction": {
              "classid": "sysimport:crosswalk:datasetarchive",
              "classname": "Harvested",
              "schemeid": "dnet:provenanceActions",
              "schemename": "dnet:provenanceActions"
            },
            "deletedbyinference": false,
            "inferred": false,
            "inferenceprovenance": "",
            "invisible": true,
            "trust": "0.9"
          },
          "qualifier": {
            "classid": "doi",
            "classname": "Digital Object Identifier",
            "schemeid": "dnet:pid_types",
            "schemename": "dnet:pid_types"
          },
          "value": "10.1371/journal.pone.0085605"
        }
      ],
      "distributionlocation": "",
      "url": ["https://doi.org/10.1371/journal.pone.0085605"],
      "alternateIdentifier": [
        {
          "dataInfo": {
            "provenanceaction": {
              "classid": "sysimport:crosswalk:datasetarchive",
              "classname": "Harvested",
              "schemeid": "dnet:provenanceActions",
              "schemename": "dnet:provenanceActions"
            },
            "deletedbyinference": false,
            "inferred": false,
            "inferenceprovenance": "",
            "invisible": true,
            "trust": "0.9"
          },
          "qualifier": {
            "classid": "pmid",
            "classname": "PubMed ID",
            "schemeid": "dnet:pid_types",
            "schemename": "dnet:pid_types"
          },
          "value": "24454899.0"
        }
      ],
      "collectedfrom": {
        "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
        "value": "Repository B"
      },
      "processingchargeamount": {
        "dataInfo": {
          "provenanceaction": {
            "classid": "sysimport:crosswalk:datasetarchive",
            "classname": "Harvested",
            "schemeid": "dnet:provenanceActions",
            "schemename": "dnet:provenanceActions"
          },
          "deletedbyinference": false,
          "inferred": false,
          "inferenceprovenance": "",
          "invisible": true,
          "trust": "0.9"
        },
        "value": "1022.02"
      },
      "instancetype": {
        "classid": "0004",
        "classname": "Conference object",
        "schemeid": "dnet:publication_resource",
        "schemename": "dnet:publication_resource"
      }
    }
  ]
}
@@ -0,0 +1,140 @@
{
  "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
  "resuttype": {"classid": "dataset"},
  "pid": [
    {
      "qualifier": {"classid": "doi"},
      "value": "10.1016/j.cmet.2011.03.013"
    },
    {
      "qualifier": {"classid": "urn"},
      "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
    },
    {
      "qualifier": {"classid": "scp-number"},
      "value": "79953761260"
    },
    {
      "qualifier": {"classid": "pmc"},
      "value": "21459329"
    }
  ],
  "collectedfrom": [
    {
      "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
      "value": "Zenodo"
    }
  ],
  "instance": [
    {
      "refereed": {
        "classid": "0000",
        "classname": "UNKNOWN",
        "schemeid": "dnet:review_levels",
        "schemename": "dnet:review_levels"
      },
      "hostedby": {
        "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
        "value": "Zenodo"
      },
      "accessright": {
        "classid": "OPEN",
        "classname": "Open Access",
        "schemeid": "dnet:access_modes",
        "schemename": "dnet:access_modes"
      },
      "processingchargecurrency": {
        "dataInfo": {
          "provenanceaction": {
            "classid": "sysimport:crosswalk:datasetarchive",
            "classname": "Harvested",
            "schemeid": "dnet:provenanceActions",
            "schemename": "dnet:provenanceActions"
          },
          "deletedbyinference": false,
          "inferred": false,
          "inferenceprovenance": "",
          "invisible": true,
          "trust": "0.9"
        },
        "value": "EUR"
      },
      "pid": [
        {
          "dataInfo": {
            "provenanceaction": {
              "classid": "sysimport:crosswalk:datasetarchive",
              "classname": "Harvested",
              "schemeid": "dnet:provenanceActions",
              "schemename": "dnet:provenanceActions"
            },
            "deletedbyinference": false,
            "inferred": false,
            "inferenceprovenance": "",
            "invisible": true,
            "trust": "0.9"
          },
          "qualifier": {
            "classid": "doi",
            "classname": "Digital Object Identifier",
            "schemeid": "dnet:pid_types",
            "schemename": "dnet:pid_types"
          },
          "value": "10.1371/journal.pone.0085605"
        }
      ],
      "distributionlocation": "",
      "url": ["https://doi.org/10.1371/journal.pone.0085605"],
      "alternateIdentifier": [
        {
          "dataInfo": {
            "provenanceaction": {
              "classid": "sysimport:crosswalk:datasetarchive",
              "classname": "Harvested",
              "schemeid": "dnet:provenanceActions",
              "schemename": "dnet:provenanceActions"
            },
            "deletedbyinference": false,
            "inferred": false,
            "inferenceprovenance": "",
            "invisible": true,
            "trust": "0.9"
          },
          "qualifier": {
            "classid": "pmid",
            "classname": "PubMed ID",
            "schemeid": "dnet:pid_types",
            "schemename": "dnet:pid_types"
          },
          "value": "24454899.0"
        }
      ],
      "collectedfrom": {
        "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
        "value": "Zenodo"
      },
      "processingchargeamount": {
        "dataInfo": {
          "provenanceaction": {
            "classid": "sysimport:crosswalk:datasetarchive",
            "classname": "Harvested",
            "schemeid": "dnet:provenanceActions",
            "schemename": "dnet:provenanceActions"
          },
          "deletedbyinference": false,
          "inferred": false,
          "inferenceprovenance": "",
          "invisible": true,
          "trust": "0.9"
        },
        "value": "1022.02"
      },
      "instancetype": {
        "classid": "0004",
        "classname": "Conference object",
        "schemeid": "dnet:publication_resource",
        "schemename": "dnet:publication_resource"
      }
    }
  ]
}
@@ -4,7 +4,7 @@
<parent>
    <groupId>eu.dnetlib.dhp</groupId>
    <artifactId>dhp-workflows</artifactId>
-   <version>1.2.4-SNAPSHOT</version>
+   <version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-actionmanager</artifactId>
@@ -0,0 +1,21 @@
style = defaultWithAlign

align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true
@@ -4,7 +4,7 @@
<parent>
    <groupId>eu.dnetlib.dhp</groupId>
    <artifactId>dhp-workflows</artifactId>
-   <version>1.2.4-SNAPSHOT</version>
+   <version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
<build>
@@ -27,6 +27,8 @@ public class Constants {
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg";
+public static final String UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID = "measure:usage_counts";
+public static final String UPDATE_KEY_USAGE_COUNTS = "count";

public static final String FOS_CLASS_ID = "FOS";
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
@@ -21,8 +21,10 @@ import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;

public class PrepareFOSSparkJob implements Serializable {

@@ -71,6 +73,7 @@ public class PrepareFOSSparkJob implements Serializable {
Result r = new Result();
FOSDataModel first = it.next();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));

HashSet<String> level1 = new HashSet<>();
HashSet<String> level2 = new HashSet<>();
HashSet<String> level3 = new HashSet<>();

@@ -81,6 +84,19 @@ public class PrepareFOSSparkJob implements Serializable {
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
r.setSubject(sbjs);
+r
+    .setDataInfo(
+        OafMapperUtils
+            .dataInfo(
+                false, null, true,
+                false,
+                OafMapperUtils
+                    .qualifier(
+                        ModelConstants.PROVENANCE_ENRICH,
+                        null,
+                        ModelConstants.DNET_PROVENANCE_ACTIONS,
+                        ModelConstants.DNET_PROVENANCE_ACTIONS),
+                null));
return r;
}, Encoders.bean(Result.class))
.write()
@@ -21,8 +21,10 @@ import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;

public class PrepareSDGSparkJob implements Serializable {

@@ -78,6 +80,19 @@ public class PrepareSDGSparkJob implements Serializable {
s -> sbjs
    .add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
r.setSubject(sbjs);
+r
+    .setDataInfo(
+        OafMapperUtils
+            .dataInfo(
+                false, null, true,
+                false,
+                OafMapperUtils
+                    .qualifier(
+                        ModelConstants.PROVENANCE_ENRICH,
+                        null,
+                        ModelConstants.DNET_PROVENANCE_ACTIONS,
+                        ModelConstants.DNET_PROVENANCE_ACTIONS),
+                null));
return r;
}, Encoders.bean(Result.class))
.write()
@@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;

@@ -67,7 +68,19 @@ public class SparkSaveUnresolved implements Serializable {
.groupByKey((MapFunction<Result, String>) Result::getId, Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
    Result ret = it.next();
-   it.forEachRemaining(r -> ret.mergeFrom(r));
+   it.forEachRemaining(r -> {
+       if (r.getInstance() != null) {
+           ret.setInstance(r.getInstance());
+       }
+       if (r.getSubject() != null) {
+           if (ret.getSubject() != null)
+               ret.getSubject().addAll(r.getSubject());
+           else
+               ret.setSubject(r.getSubject());
+       }
+
+       // ret.mergeFrom(r)
+   });
    return ret;
}, Encoders.bean(Result.class))
.write()
@@ -14,6 +14,7 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;

@@ -21,6 +22,7 @@ import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

+import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;

@@ -83,10 +85,13 @@ public class CreateActionSetSparkJob implements Serializable {
private static void extractContent(SparkSession spark, String inputPath, String outputPath,
    boolean shouldDuplicateRels) {
    spark
-       .sqlContext()
-       .createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
+       .read()
+       .textFile(inputPath + "/*")
+       .map(
+           (MapFunction<String, COCI>) value -> OBJECT_MAPPER.readValue(value, COCI.class),
+           Encoders.bean(COCI.class))
        .flatMap(
-           (FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
+           (FlatMapFunction<COCI, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
            Encoders.bean(Relation.class))
        .filter((FilterFunction<Relation>) value -> value != null)
        .toJavaRDD()

@@ -98,27 +103,31 @@ public class CreateActionSetSparkJob implements Serializable {

    }

-   private static List<Relation> createRelation(String value, boolean duplicate) {
-       String[] line = value.split(",");
-       if (!line[1].startsWith("10.")) {
-           return new ArrayList<>();
-       }
+   private static List<Relation> createRelation(COCI value, boolean duplicate) {

        List<Relation> relationList = new ArrayList<>();

-       String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
-       final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
+       String citing = ID_PREFIX
+           + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting()));
+       final String cited = ID_PREFIX
+           + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));

        if (!citing.equals(cited)) {
            relationList
                .addAll(
                    getRelations(
                        citing,
                        cited));

-           if (duplicate && line[1].endsWith(".refs")) {
+           if (duplicate && value.getCiting().endsWith(".refs")) {
                citing = ID_PREFIX + IdentifierFactory
-                   .md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs"))));
+                   .md5(
+                       CleaningFunctions
+                           .normalizePidValue(
+                               "doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs"))));
                relationList.addAll(getRelations(citing, cited));
            }
        }

        return relationList;
    }
@@ -0,0 +1,103 @@

package eu.dnetlib.dhp.actionmanager.opencitations;

import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.IOException;
import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class ReadCOCI implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(ReadCOCI.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                ReadCOCI.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String[] inputFile = parser.get("inputFile").split(";");
        log.info("inputFile {}", inputFile.toString());
        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String workingPath = parser.get("workingPath");
        log.info("workingPath {}", workingPath);

        SparkConf sconf = new SparkConf();

        final String delimiter = Optional
            .ofNullable(parser.get("delimiter"))
            .orElse(DEFAULT_DELIMITER);

        runWithSparkSession(
            sconf,
            isSparkSessionManaged,
            spark -> {
                doRead(
                    spark,
                    workingPath,
                    inputFile,
                    outputPath,
                    delimiter);
            });
    }

    private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
        String outputPath,
        String delimiter) throws IOException {

        for (String inputFile : inputFiles) {
            String p_string = workingPath + "/" + inputFile + ".gz";

            Dataset<Row> cociData = spark
                .read()
                .format("csv")
                .option("sep", delimiter)
                .option("inferSchema", "true")
                .option("header", "true")
                .option("quotes", "\"")
                .load(p_string)
                .repartition(100);

            cociData.map((MapFunction<Row, COCI>) row -> {
                COCI coci = new COCI();
                coci.setOci(row.getString(0));
                coci.setCiting(row.getString(1));
                coci.setCited(row.getString(2));
                return coci;
            }, Encoders.bean(COCI.class))
                .write()
                .mode(SaveMode.Overwrite)
                .option("compression", "gzip")
                .json(outputPath + inputFile);
        }

    }

}
@@ -0,0 +1,39 @@

package eu.dnetlib.dhp.actionmanager.opencitations.model;

import java.io.Serializable;

import com.opencsv.bean.CsvBindByPosition;

public class COCI implements Serializable {
    private String oci;

    private String citing;

    private String cited;

    public String getOci() {
        return oci;
    }

    public void setOci(String oci) {
        this.oci = oci;
    }

    public String getCiting() {
        return citing;
    }

    public void setCiting(String citing) {
        this.citing = citing;
    }

    public String getCited() {
        return cited;
    }

    public void setCited(String cited) {
        this.cited = cited;
    }

}
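The COCI bean above is what ReadCOCI writes out as JSON and what CreateActionSetSparkJob now deserializes with Jackson. A quick round-trip sketch, using made-up identifiers for illustration:

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;

public class CociRoundTripSketch {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();

        COCI coci = new COCI();
        coci.setOci("oci-example");
        coci.setCiting("10.1000/citing.article");
        coci.setCited("10.1000/cited.article");

        // Serialized form (field order may vary): {"oci":"...","citing":"...","cited":"..."}
        String json = mapper.writeValueAsString(coci);
        COCI back = mapper.readValue(json, COCI.class);
        System.out.println(back.getCiting() + " -> " + back.getCited());
    }
}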
@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.actionmanager.ror;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.ORG_ORG_RELTYPE;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;

@@ -29,8 +30,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.sql.Dataset;
+import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;

@@ -38,8 +38,8 @@ import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

-import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;
+import eu.dnetlib.dhp.actionmanager.ror.model.Relationship;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;

@@ -48,8 +48,10 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;

@@ -112,24 +114,21 @@ public class GenerateRorActionSetJob {
    final String outputPath) throws IOException {

    readInputPath(spark, inputPath)
-       .map(
-           (MapFunction<RorOrganization, Organization>) GenerateRorActionSetJob::convertRorOrg,
-           Encoders.bean(Organization.class))
-       .toJavaRDD()
-       .map(o -> new AtomicAction<>(Organization.class, o))
+       .map(GenerateRorActionSetJob::convertRorOrg)
+       .flatMap(List::iterator)
        .mapToPair(
            aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
                new Text(OBJECT_MAPPER.writeValueAsString(aa))))
        .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
    }

-   protected static Organization convertRorOrg(final RorOrganization r) {
+   protected static List<AtomicAction<? extends Oaf>> convertRorOrg(final RorOrganization r) {

        final Date now = new Date();

        final Organization o = new Organization();

-       o.setId(String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(r.getId())));
+       o.setId(calculateOpenaireId(r.getId()));
        o.setOriginalId(Arrays.asList(String.format("%s::%s", ROR_NS_PREFIX, r.getId())));
        o.setCollectedfrom(ROR_COLLECTED_FROM);
        o.setPid(pids(r));

@@ -166,7 +165,43 @@ public class GenerateRorActionSetJob {
        o.setDataInfo(ROR_DATA_INFO);
        o.setLastupdatetimestamp(now.getTime());

-       return o;
+       final List<AtomicAction<? extends Oaf>> res = new ArrayList<>();
+       res.add(new AtomicAction<>(Organization.class, o));
+
+       for (final Relationship rorRel : r.getRelationships()) {
+           if (rorRel.getType().equalsIgnoreCase("parent")) {
+               final String orgId1 = calculateOpenaireId(r.getId());
+               final String orgId2 = calculateOpenaireId(rorRel.getId());
+               res
+                   .add(
+                       new AtomicAction<>(Relation.class,
+                           calculateHierarchyRel(orgId1, orgId2, ModelConstants.IS_PARENT_OF)));
+               res
+                   .add(
+                       new AtomicAction<>(Relation.class,
+                           calculateHierarchyRel(orgId2, orgId1, ModelConstants.IS_CHILD_OF)));
+           }
+       }
+
+       return res;
+
    }

+   private static Relation calculateHierarchyRel(final String source, final String target, final String relClass) {
+       final Relation rel = new Relation();
+       rel.setSource(source);
+       rel.setTarget(target);
+       rel.setRelType(ORG_ORG_RELTYPE);
+       rel.setSubRelType(ModelConstants.RELATIONSHIP);
+       rel.setRelClass(relClass);
+       rel.setCollectedfrom(ROR_COLLECTED_FROM);
+       rel.setDataInfo(ROR_DATA_INFO);
+       rel.setLastupdatetimestamp(System.currentTimeMillis());
+       return rel;
+   }
+
+   private static String calculateOpenaireId(final String rorId) {
+       return String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(rorId));
+   }
+
    private static List<StructuredProperty> pids(final RorOrganization r) {

@@ -202,14 +237,14 @@ public class GenerateRorActionSetJob {
        .collect(Collectors.toList());
    }

-   private static Dataset<RorOrganization> readInputPath(
+   private static JavaRDD<RorOrganization> readInputPath(
        final SparkSession spark,
        final String path) throws IOException {

        try (final FileSystem fileSystem = FileSystem.get(new Configuration());
            final InputStream is = fileSystem.open(new Path(path))) {
            final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class);
-           return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class));
+           return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class)).toJavaRDD();
        }
    }
@@ -0,0 +1,149 @@

package eu.dnetlib.dhp.actionmanager.usagestats;

import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;

/**
 * created the Atomic Action for each type of results
 */
public class SparkAtomicActionUsageJob implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionUsageJob.class);
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static <I extends Result> void main(String[] args) throws Exception {

        String jsonConfiguration = IOUtils
            .toString(
                SparkAtomicActionUsageJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);

        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath {}: ", outputPath);

        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

        final String dbname = parser.get("usagestatsdb");

        final String workingPath = parser.get("workingPath");

        runWithSparkHiveSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                removeOutputDir(spark, outputPath);
                prepareResults(dbname, spark, workingPath);
                prepareActionSet(spark, workingPath, outputPath);
            });
    }

    public static void prepareResults(String db, SparkSession spark, String workingPath) {
        spark
            .sql(
                "Select result_id, downloads, views " +
                    "from " + db + ".usage_stats")
            .as(Encoders.bean(UsageStatsModel.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(workingPath);
    }

    public static void prepareActionSet(SparkSession spark, String inputPath, String outputPath) {
        readPath(spark, inputPath, UsageStatsModel.class)
            .groupByKey((MapFunction<UsageStatsModel, String>) us -> us.getResult_id(), Encoders.STRING())
            .mapGroups((MapGroupsFunction<String, UsageStatsModel, Result>) (k, it) -> {
                UsageStatsModel first = it.next();
                it.forEachRemaining(us -> {
                    first.setDownloads(first.getDownloads() + us.getDownloads());
                    first.setViews(first.getViews() + us.getViews());
                });

                Result res = new Result();
                res.setId("50|" + k);

                res.setMeasures(getMeasure(first.getDownloads(), first.getViews()));
                return res;
            }, Encoders.bean(Result.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    private static List<Measure> getMeasure(Long downloads, Long views) {
        DataInfo dataInfo = OafMapperUtils
            .dataInfo(
                false,
                UPDATE_DATA_INFO_TYPE,
                true,
                false,
                OafMapperUtils
                    .qualifier(
                        UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID,
                        UPDATE_CLASS_NAME,
                        ModelConstants.DNET_PROVENANCE_ACTIONS,
                        ModelConstants.DNET_PROVENANCE_ACTIONS),
                "");

        return Arrays
            .asList(
                OafMapperUtils
                    .newMeasureInstance("downloads", String.valueOf(downloads), UPDATE_KEY_USAGE_COUNTS, dataInfo),
                OafMapperUtils.newMeasureInstance("views", String.valueOf(views), UPDATE_KEY_USAGE_COUNTS, dataInfo));

    }

    private static void removeOutputDir(SparkSession spark, String path) {
        HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
    }

    public static <R> Dataset<R> readPath(
        SparkSession spark, String inputPath, Class<R> clazz) {
        return spark
            .read()
            .textFile(inputPath)
            .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
    }

}
@@ -0,0 +1,34 @@

package eu.dnetlib.dhp.actionmanager.usagestats;

import java.io.Serializable;

public class UsageStatsModel implements Serializable {
    private String result_id;
    private Long downloads;
    private Long views;

    public String getResult_id() {
        return result_id;
    }

    public void setResult_id(String result_id) {
        this.result_id = result_id;
    }

    public Long getDownloads() {
        return downloads;
    }

    public void setDownloads(Long downloads) {
        this.downloads = downloads;
    }

    public Long getViews() {
        return views;
    }

    public void setViews(Long views) {
        this.views = views;
    }
}
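For reference, the merge that SparkAtomicActionUsageJob.prepareActionSet performs per result id boils down to summing downloads and views; a plain-Java sketch of that reduction over two hypothetical rows:

import java.util.Arrays;
import java.util.Iterator;

import eu.dnetlib.dhp.actionmanager.usagestats.UsageStatsModel;

public class UsageStatsMergeSketch {
    public static void main(String[] args) {
        UsageStatsModel a = new UsageStatsModel();
        a.setResult_id("dedup_wf_001::abc"); // hypothetical result id
        a.setDownloads(3L);
        a.setViews(10L);

        UsageStatsModel b = new UsageStatsModel();
        b.setResult_id("dedup_wf_001::abc");
        b.setDownloads(2L);
        b.setViews(5L);

        Iterator<UsageStatsModel> it = Arrays.asList(a, b).iterator();
        UsageStatsModel first = it.next();
        it.forEachRemaining(us -> {
            first.setDownloads(first.getDownloads() + us.getDownloads());
            first.setViews(first.getViews() + us.getViews());
        });

        // The job would then emit a Result with id "50|dedup_wf_001::abc"
        // carrying measures downloads=5 and views=15.
        System.out.println(first.getDownloads() + " downloads, " + first.getViews() + " views");
    }
}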
@@ -0,0 +1,37 @@
[
    {
        "paramName": "wp",
        "paramLongName": "workingPath",
        "paramDescription": "the zipped opencitations file",
        "paramRequired": true
    },

    {
        "paramName": "issm",
        "paramLongName": "isSparkSessionManaged",
        "paramDescription": "the hdfs name node",
        "paramRequired": false
    },
    {
        "paramName": "d",
        "paramLongName": "delimiter",
        "paramDescription": "the hdfs name node",
        "paramRequired": false
    },
    {
        "paramName": "op",
        "paramLongName": "outputPath",
        "paramDescription": "the hdfs name node",
        "paramRequired": true
    },
    {
        "paramName": "if",
        "paramLongName": "inputFile",
        "paramDescription": "the hdfs name node",
        "paramRequired": true
    }
]
@@ -26,6 +26,7 @@
<switch>
    <case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
    <case to="extract">${wf:conf('resumeFrom') eq 'ExtractContent'}</case>
+   <case to="read">${wf:conf('resumeFrom') eq 'ReadContent'}</case>
    <default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
</switch>
</decision>

@@ -60,6 +61,32 @@
<arg>--inputFile</arg><arg>${inputFile}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</java>
<ok to="read"/>
<error to="Kill"/>
</action>

<action name="read">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Produces the AS for OC</name>
        <class>eu.dnetlib.dhp.actionmanager.opencitations.ReadCOCI</class>
        <jar>dhp-aggregation-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
        <arg>--outputPath</arg><arg>${workingPath}/COCI_JSON/</arg>
        <arg>--delimiter</arg><arg>${delimiter}</arg>
        <arg>--inputFile</arg><arg>${inputFileCoci}</arg>
    </spark>
    <ok to="create_actionset"/>
    <error to="Kill"/>
</action>

@@ -81,7 +108,7 @@
    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
    --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
-<arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
+<arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>
@@ -0,0 +1,32 @@
[
    {
        "paramName": "issm",
        "paramLongName": "isSparkSessionManaged",
        "paramDescription": "when true will stop SparkSession after job execution",
        "paramRequired": false
    },
    {
        "paramName": "hmu",
        "paramLongName": "hive_metastore_uris",
        "paramDescription": "the URI for the hive metastore",
        "paramRequired": true
    },
    {
        "paramName": "o",
        "paramLongName": "outputPath",
        "paramDescription": "the path of the new ActionSet",
        "paramRequired": true
    },
    {
        "paramName": "sdb",
        "paramLongName": "usagestatsdb",
        "paramDescription": "the name of the db to be used",
        "paramRequired": true
    },
    {
        "paramName": "wp",
        "paramLongName": "workingPath",
        "paramDescription": "the workingPath where to save the content of the usage_stats table",
        "paramRequired": true
    }
]
@@ -0,0 +1,30 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>hiveJdbcUrl</name>
        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
    </property>
    <property>
        <name>hiveDbName</name>
        <value>openaire</value>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
</configuration>
@@ -0,0 +1,99 @@
<workflow-app name="UsageStatsCounts" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
        </property>
        <property>
            <name>usagestatsdb</name>
            <description>the name of the db to be used</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>

        </configuration>
    </global>
    <start to="atomicactions"/>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="atomicactions">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Produces the atomic action with the usage stats count for results</name>
            <class>eu.dnetlib.dhp.actionmanager.usagestats.SparkAtomicActionUsageJob</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--hive_metastore_uris</arg><arg>${hiveMetastoreUris}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
            <arg>--usagestatsdb</arg><arg>${usagestatsdb}</arg>
            <arg>--workingPath</arg><arg>${workingDir}/usageDb</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
@@ -0,0 +1,63 @@
from urllib.request import urlopen
import json


def retrieve_datacite_clients(base_url):
    datacite_clients = {}
    while base_url is not None:
        with urlopen(base_url) as response:
            print(f"requesting {base_url}")
            response_content = response.read()
            data = json.loads(response_content)
            if 'data' in data and len(data['data'])>0:
                for item in data['data']:
                    datacite_clients[item['id'].lower()]= item['attributes']['re3data'].lower().replace("https://doi.org/","")
                base_url = data['links']['next']
            else:
                base_url = None
    return datacite_clients


def retrieve_r3data(start_url):
    r3data_clients = {}
    page_number = 1
    base_url = start_url
    while base_url is not None:
        with urlopen(base_url) as response:
            print(f"requesting {base_url}")
            response_content = response.read()
            data = json.loads(response_content)
            if 'data' in data and len(data['data'])>0:
                for item in data['data']:
                    r3data_clients[item['id'].lower()]= dict(
                        openaire_id= "re3data_____::"+item['attributes']['re3dataId'].lower(),
                        official_name=item['attributes']['repositoryName']
                    )
                page_number +=1
                base_url = f"{start_url}&page[number]={page_number}"
            else:
                base_url = None
    return r3data_clients


base_url ="https://api.datacite.org/clients?query=re3data_id:*&page[size]=250"

dc = retrieve_datacite_clients(base_url)
r3 = retrieve_r3data("https://api.datacite.org/re3data?page[size]=250")

result = {}

for item in dc:
    res = dc[item].lower()
    if res not in r3:
        print(f"missing {res} for {item} in dictionary")
    else:
        result[item.upper()]= dict(openaire_id=r3[res]["openaire_id"],datacite_name=r3[res]["official_name"], official_name=r3[res]["official_name"] )


with open('hostedBy_map.json', 'w', encoding='utf8') as json_file:
    json.dump(result, json_file, ensure_ascii=False, indent=1)

(File diff suppressed because it is too large.)
@@ -49,7 +49,7 @@ abstract class AbstractRestClient extends Iterator[String] {
    }

    private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
-       val timeout = 60; // seconds
+       val timeout = 600; // seconds
        val config = RequestConfig
            .custom()
            .setConnectTimeout(timeout * 1000)

@@ -46,7 +46,7 @@ object ImportDatacite {
    Source
        .fromInputStream(
            getClass.getResourceAsStream(
-               "/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
+               "/eu/dnetlib/dhp/datacite/import_from_api.json"
            )
        )
        .mkString
@@ -146,6 +146,11 @@ public class PrepareTest {
        .get(0)
        .getValue());

+       final String doi2 = "unresolved::10.3390/s18072310::doi";
+
+       Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count());
+       Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size());
+
    }

    @Test

@@ -259,59 +264,61 @@ public class PrepareTest {
        .collect()
        .contains("8. Economic growth"));

    }

    @Test
    void test3() throws Exception {
        final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_fos_results_20_12_2021.csv.gz";

        final String outputPath = workingDir.toString() + "/fos.json";
        GetFOSSparkJob
            .main(
                new String[] {
                    "--isSparkSessionManaged", Boolean.FALSE.toString(),
                    "--sourcePath", sourcePath,

                    "-outputPath", outputPath

                });

        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        JavaRDD<FOSDataModel> tmp = sc
            .textFile(outputPath)
            .map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));

        tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
        tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null));
        tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null));
        tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
        Assertions.assertEquals(32, tmp.filter(row -> row.getDataInfo() != null).count());

    }

    @Test
    void test4() throws Exception {
        final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_sdg_results_20_12_21.csv.gz";

        final String outputPath = workingDir.toString() + "/sdg.json";
        GetSDGSparkJob
            .main(
                new String[] {
                    "--isSparkSessionManaged", Boolean.FALSE.toString(),
                    "--sourcePath", sourcePath,

                    "-outputPath", outputPath

                });

        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        JavaRDD<SDGDataModel> tmp = sc
            .textFile(outputPath)
            .map(item -> OBJECT_MAPPER.readValue(item, SDGDataModel.class));

        tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
        tmp.foreach(t -> Assertions.assertTrue(t.getSbj() != null));

    }
//    @Test
//    void test3() throws Exception {
//        final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_fos_results_20_12_2021.csv.gz";
//
//        final String outputPath = workingDir.toString() + "/fos.json";
//        GetFOSSparkJob
//            .main(
//                new String[] {
//                    "--isSparkSessionManaged", Boolean.FALSE.toString(),
//                    "--sourcePath", sourcePath,
//
//                    "-outputPath", outputPath
//
//                });
//
//        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
//
//        JavaRDD<FOSDataModel> tmp = sc
//            .textFile(outputPath)
//            .map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));
//
//        tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
//        tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null));
//        tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null));
//        tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
//
//    }
//
//    @Test
//    void test4() throws Exception {
//        final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_sdg_results_20_12_21.csv.gz";
//
//        final String outputPath = workingDir.toString() + "/sdg.json";
//        GetSDGSparkJob
//            .main(
//                new String[] {
//                    "--isSparkSessionManaged", Boolean.FALSE.toString(),
//                    "--sourcePath", sourcePath,
//
//                    "-outputPath", outputPath
//
//                });
//
//        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
//
//        JavaRDD<SDGDataModel> tmp = sc
//            .textFile(outputPath)
//            .map(item -> OBJECT_MAPPER.readValue(item, SDGDataModel.class));
//
//        tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
//        tmp.foreach(t -> Assertions.assertTrue(t.getSbj() != null));
//
//    }
}
@@ -196,6 +196,9 @@ public class ProduceTest {
        final String doi = "unresolved::10.3390/s18072310::doi";
        JavaRDD<Result> tmp = getResultJavaRDD();

        tmp
            .filter(row -> row.getId().equals(doi))
            .foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
        Assertions
            .assertEquals(
                3, tmp
@@ -76,7 +76,7 @@ public class CreateOpenCitationsASTest {

        String inputPath = getClass()
            .getResource(
                "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
                "/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
            .getPath();

        CreateActionSetSparkJob
@@ -99,7 +99,7 @@ public class CreateOpenCitationsASTest {
            .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
            .map(aa -> ((Relation) aa.getPayload()));

        assertEquals(60, tmp.count());
        assertEquals(62, tmp.count());

        // tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
@@ -110,7 +110,7 @@ public class CreateOpenCitationsASTest {

        String inputPath = getClass()
            .getResource(
                "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
                "/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
            .getPath();

        CreateActionSetSparkJob
@@ -131,7 +131,7 @@ public class CreateOpenCitationsASTest {
            .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
            .map(aa -> ((Relation) aa.getPayload()));

        assertEquals(44, tmp.count());
        assertEquals(46, tmp.count());

        // tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
@@ -142,7 +142,7 @@ public class CreateOpenCitationsASTest {

        String inputPath = getClass()
            .getResource(
                "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
                "/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
            .getPath();

        CreateActionSetSparkJob
@@ -175,7 +175,7 @@ public class CreateOpenCitationsASTest {

        String inputPath = getClass()
            .getResource(
                "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
                "/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
            .getPath();

        CreateActionSetSparkJob
@@ -215,7 +215,7 @@ public class CreateOpenCitationsASTest {

        String inputPath = getClass()
            .getResource(
                "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
                "/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
            .getPath();

        CreateActionSetSparkJob
@@ -240,8 +240,8 @@ public class CreateOpenCitationsASTest {
            assertEquals("citation", r.getSubRelType());
            assertEquals("resultResult", r.getRelType());
        });
        assertEquals(22, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
        assertEquals(22, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
        assertEquals(23, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
        assertEquals(23, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());

    }
@@ -250,7 +250,7 @@ public class CreateOpenCitationsASTest {

        String inputPath = getClass()
            .getResource(
                "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
                "/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
            .getPath();

        CreateActionSetSparkJob
@@ -295,7 +295,7 @@ public class CreateOpenCitationsASTest {

        String inputPath = getClass()
            .getResource(
                "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
                "/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
            .getPath();

        CreateActionSetSparkJob
@ -0,0 +1,138 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.opencitations;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocalFileSystem;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
|
||||
public class ReadCOCITest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(ReadCOCITest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(ReadCOCITest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(ReadCOCITest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(ReadCOCITest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
void testReadCOCI() throws Exception {
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
|
||||
.getPath();
|
||||
|
||||
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input1.gz"));
|
||||
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input2.gz"));
|
||||
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input3.gz"));
|
||||
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input4.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4.gz"));
|
||||
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input5.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input5.gz"));
|
||||
|
||||
ReadCOCI
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-workingPath",
|
||||
workingDir.toString() + "/COCI",
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/COCI_json/",
|
||||
"-inputFile", "input1;input2;input3;input4;input5"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<COCI> tmp = sc
|
||||
.textFile(workingDir.toString() + "/COCI_json/*/")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, COCI.class));
|
||||
|
||||
Assertions.assertEquals(24, tmp.count());
|
||||
|
||||
Assertions.assertEquals(1, tmp.filter(c -> c.getCiting().equals("10.1207/s15327647jcd3,4-01")).count());
|
||||
|
||||
Assertions.assertEquals(8, tmp.filter(c -> c.getCiting().indexOf(".refs") > -1).count());
|
||||
}
|
||||
|
||||
}
|
|
@ -1,7 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -13,9 +16,12 @@ import org.junit.jupiter.api.Test;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
@Disabled
|
||||
class GenerateRorActionSetJobTest {
|
||||
|
||||
private static final ObjectMapper mapper = new ObjectMapper();
|
||||
|
@ -30,21 +36,40 @@ class GenerateRorActionSetJobTest {
|
|||
void testConvertRorOrg() throws Exception {
|
||||
final RorOrganization r = mapper
|
||||
.readValue(IOUtils.toString(getClass().getResourceAsStream("ror_org.json")), RorOrganization.class);
|
||||
final Organization org = GenerateRorActionSetJob.convertRorOrg(r);
|
||||
final List<AtomicAction<? extends Oaf>> aas = GenerateRorActionSetJob.convertRorOrg(r);
|
||||
|
||||
Assertions.assertEquals(3, aas.size());
|
||||
assertEquals(Organization.class, aas.get(0).getClazz());
|
||||
assertEquals(Relation.class, aas.get(1).getClazz());
|
||||
assertEquals(Relation.class, aas.get(2).getClazz());
|
||||
|
||||
final Organization o = (Organization) aas.get(0).getPayload();
|
||||
final Relation r1 = (Relation) aas.get(1).getPayload();
|
||||
final Relation r2 = (Relation) aas.get(2).getPayload();
|
||||
|
||||
assertEquals(o.getId(), r1.getSource());
|
||||
assertEquals(r1.getSource(), r2.getTarget());
|
||||
assertEquals(r2.getSource(), r1.getTarget());
|
||||
assertEquals(ModelConstants.IS_PARENT_OF, r1.getRelClass());
|
||||
assertEquals(ModelConstants.IS_CHILD_OF, r2.getRelClass());
|
||||
|
||||
System.out.println(mapper.writeValueAsString(o));
|
||||
System.out.println(mapper.writeValueAsString(r1));
|
||||
System.out.println(mapper.writeValueAsString(r2));
|
||||
|
||||
final String s = mapper.writeValueAsString(org);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(s));
|
||||
System.out.println(s);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
void testConvertAllRorOrg() throws Exception {
|
||||
final RorOrganization[] arr = mapper
|
||||
.readValue(IOUtils.toString(new FileInputStream(local_file_path)), RorOrganization[].class);
|
||||
|
||||
for (final RorOrganization r : arr) {
|
||||
Organization o = GenerateRorActionSetJob.convertRorOrg(r);
|
||||
Assertions.assertNotNull(o);
|
||||
final List<AtomicAction<? extends Oaf>> aas = GenerateRorActionSetJob.convertRorOrg(r);
|
||||
Assertions.assertFalse(aas.isEmpty());
|
||||
Assertions.assertNotNull(aas.get(0));
|
||||
final Organization o = (Organization) aas.get(0).getPayload();
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(o.getId()));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,259 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.usagestats;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class SparkAtomicActionCountJobTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(SparkAtomicActionCountJobTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(SparkAtomicActionCountJobTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(SparkAtomicActionCountJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkAtomicActionCountJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
void testMatch() {
|
||||
String usageScoresPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/usagestats/usagestatsdb")
|
||||
.getPath();
|
||||
|
||||
SparkAtomicActionUsageJob.prepareActionSet(spark, usageScoresPath, workingDir.toString() + "/actionSet");
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Result> tmp = sc
|
||||
.textFile(workingDir.toString() + "/actionSet")
|
||||
.map(usm -> OBJECT_MAPPER.readValue(usm, Result.class));
|
||||
|
||||
Assertions.assertEquals(9, tmp.count());
|
||||
|
||||
tmp.foreach(r -> Assertions.assertEquals(2, r.getMeasures().size()));
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(u -> Assertions.assertFalse(u.getDataInfo().getDeletedbyinference()))));
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m.getUnit().stream().forEach(u -> Assertions.assertTrue(u.getDataInfo().getInferred()))));
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(u -> Assertions.assertFalse(u.getDataInfo().getInvisible()))));
|
||||
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(
|
||||
u -> Assertions
|
||||
.assertEquals(
|
||||
"measure:usage_counts",
|
||||
u.getDataInfo().getProvenanceaction().getClassid()))));
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(
|
||||
u -> Assertions
|
||||
.assertEquals(
|
||||
"Inferred by OpenAIRE",
|
||||
u.getDataInfo().getProvenanceaction().getClassname()))));
|
||||
|
||||
tmp
|
||||
.foreach(
|
||||
r -> r
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.forEach(
|
||||
m -> m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.forEach(
|
||||
u -> Assertions
|
||||
.assertEquals(
|
||||
"count",
|
||||
u.getKey()))));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, tmp.filter(r -> r.getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6")).count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("downloads"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"5",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("views"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|doi_________::17eda2ff77407538fbe5d3d719b9d1c0"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("downloads"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"1",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|doi_________::17eda2ff77407538fbe5d3d719b9d1c0"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("views"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"2",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|doi_________::3085e4c6e051378ca6157fe7f0430c1f"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("downloads"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"6",
|
||||
tmp
|
||||
.filter(r -> r.getId().equals("50|doi_________::3085e4c6e051378ca6157fe7f0430c1f"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(m -> m.getId().equals("views"))
|
||||
.collect(Collectors.toList())
|
||||
.get(0)
|
||||
.getUnit()
|
||||
.get(0)
|
||||
.getValue());
|
||||
}
|
||||
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -1,8 +0,0 @@
|
|||
oci,citing,cited,creation,timespan,journal_sc,author_sc
|
||||
02001000007362801000805046300010563030608046333-0200101010136193701050501630209010637020000083700020400083733,10.1007/s10854-015-3684-x,10.1111/j.1551-2916.2008.02408.x,2015-09-01,P7Y2M,no,no
|
||||
02001000007362801000805046300010563030608046333-02001000007362801000805046300010463020101046309,10.1007/s10854-015-3684-x,10.1007/s10854-014-2114-9,2015-09-01,P1Y2M4D,yes,no
|
||||
02001000007362801000805046300010563030608046333-020010001063619371214271022182329370200010337000937000609,10.1007/s10854-015-3684-x,10.1016/j.ceramint.2013.09.069,2015-09-01,P1Y6M,no,no
|
||||
02001000007362801000805046300010563030608046333-02001000007362801000805046300000963090901036304,10.1007/s10854-015-3684-x,10.1007/s10854-009-9913-4,2015-09-01,P6Y3M10D,yes,no
|
||||
02001000007362801000805046300010563030608046333-02001000106360000030863010009085807025909000307006305,10.1007/s10854-015-3684-x,10.1016/0038-1098(72)90370-5,2015-09-01,P43Y8M,no,no
|
||||
02001000007362801000805046300010563030608056309-02001000106361937281010370200010437000937000308,10.1007/s10854-015-3685-9,10.1016/j.saa.2014.09.038,2015-09-03,P0Y7M,no,no
|
||||
02001000007362801000805046300010563030608056309-0200100010636193722102912171027370200010537000437000106,10.1007/s10854-015-3685-9,10.1016/j.matchar.2015.04.016,2015-09-03,P0Y2M,no,no
|
Binary file not shown.
|
@ -1,8 +0,0 @@
|
|||
oci,citing,cited,creation,timespan,journal_sc,author_sc
|
||||
02001000308362804010509076300010963000003086301-0200100020936020001003227000009010004,10.1038/s41597-019-0038-1,10.1029/2010wr009104,2019-04-15,P8Y1M,no,no
|
||||
02001000308362804010509076300010963000003086301-0200100010636280103060463080105025800015900000006006303,10.1038/s41597-019-0038-1,10.1016/s1364-8152(01)00060-3,2019-04-15,P17Y3M,no,no
|
||||
02001000308362804010509076300010963000003086301-02001000007362800000407076300010063000401066333,10.1038/s41597-019-0038-1,10.1007/s00477-010-0416-x,2019-04-15,P8Y9M6D,no,no
|
||||
02001000308362804010509076300010963000003086301-02001000007362800000700046300010363000905016308,10.1038/s41597-019-0038-1,10.1007/s00704-013-0951-8,2019-04-15,P5Y9M23D,no,no
|
||||
02001000308362804010509076300010963000003086301-02001000002361924123705070707,10.1038/s41597-019-0038-1,10.1002/joc.5777,2019-04-15,P0Y8M1D,no,no
|
||||
02001000308362804010509076300010963000003086301-02005010904361714282863020263040504076302000108,10.1038/s41597-019-0038-1,10.5194/hess-22-4547-2018,2019-04-15,P0Y7M18D,no,no
|
||||
02001000308362804010509076300010963000003086301-02001000002361924123703050404,10.1038/s41597-019-0038-1,10.1002/joc.3544,2019-04-15,P6Y9M6D,no,no
|
Binary file not shown.
|
@ -1,9 +0,0 @@
|
|||
oci,citing,cited,creation,timespan,journal_sc,author_sc
|
||||
0200100000236090708010101090307000202023727141528-020050302063600040000010307,10.1002/9781119370222.refs,10.5326/0400137,2020-06-22,P16Y3M,no,no
|
||||
0200100000236090708010101090307000202023727141528-0200101010136193701050302630905003337020000073700000301093733,10.1002/9781119370222.refs,10.1111/j.1532-950x.2007.00319.x,2020-06-22,P12Y8M,no,no
|
||||
0200100000236090708010101090307000202023727141528-0200101010136312830370102030509,10.1002/9781119370222.refs,10.1111/vsu.12359,2020-06-22,P4Y10M29D,no,no
|
||||
0200100000236090708010101090307000202023727141528-020050302063600030900020904,10.1002/9781119370222.refs,10.5326/0390294,2020-06-22,P17Y1M,no,no
|
||||
0200100000236090708010101090307000202023727141528-020050302063600040200030701,10.1002/9781119370222.refs,10.5326/0420371,2020-06-22,P13Y9M,no,no
|
||||
0200100000236090708010101090307000202023727141528-0200101010136193701050302630905003337020001033701020000003733,10.1002/9781119370222.refs,10.1111/j.1532-950x.2013.12000.x,2020-06-22,P7Y2M,no,no
|
||||
0200100000236090708010101090307000202023727141528-020010008003600000408000106093702000006370306070200,10.1002/9781119370222.refs,10.1080/00480169.2006.36720,2020-06-22,P13Y6M,no,no
|
||||
0200100000236090708010101090307000202023727141528-0200101010136193701070501630008010337020000063700000003033733,10.1002/9781119370222.refs,10.1111/j.1751-0813.2006.00033.x,2020-06-22,P13Y8M,no,no
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -2,23 +2,23 @@
|
|||
"ip_addresses": [],
|
||||
"aliases": [],
|
||||
"acronyms": [
|
||||
"ANU"
|
||||
"MSO"
|
||||
],
|
||||
"links": [
|
||||
"http://www.anu.edu.au/"
|
||||
"https://rsaa.anu.edu.au/observatories/mount-stromlo-observatory"
|
||||
],
|
||||
"country": {
|
||||
"country_code": "AU",
|
||||
"country_name": "Australia"
|
||||
},
|
||||
"name": "Australian National University",
|
||||
"wikipedia_url": "http://en.wikipedia.org/wiki/Australian_National_University",
|
||||
"name": "Mount Stromlo Observatory",
|
||||
"wikipedia_url": "https://en.wikipedia.org/wiki/Mount_Stromlo_Observatory",
|
||||
"addresses": [
|
||||
{
|
||||
"lat": -35.2778,
|
||||
"lat": -35.320278,
|
||||
"state_code": "AU-ACT",
|
||||
"country_geonames_id": 2077456,
|
||||
"lng": 149.1205,
|
||||
"lng": 149.006944,
|
||||
"state": "Australian Capital Territory",
|
||||
"city": "Canberra",
|
||||
"geonames_city": {
|
||||
|
@ -61,63 +61,34 @@
|
|||
"types": [
|
||||
"Education"
|
||||
],
|
||||
"established": 1946,
|
||||
"established": 1924,
|
||||
"relationships": [
|
||||
{
|
||||
"type": "Related",
|
||||
"id": "https://ror.org/041c7s516",
|
||||
"label": "Calvary Hospital"
|
||||
},
|
||||
{
|
||||
"type": "Related",
|
||||
"id": "https://ror.org/04h7nbn38",
|
||||
"label": "Canberra Hospital"
|
||||
},
|
||||
{
|
||||
"type": "Related",
|
||||
"id": "https://ror.org/030jpqj15",
|
||||
"label": "Goulburn Base Hospital"
|
||||
},
|
||||
{
|
||||
"type": "Child",
|
||||
"id": "https://ror.org/006a4jj40",
|
||||
"label": "Mount Stromlo Observatory"
|
||||
"type": "Parent",
|
||||
"id": "https://ror.org/019wvm592",
|
||||
"label": "Australian National University"
|
||||
}
|
||||
],
|
||||
"email_address": null,
|
||||
"external_ids": {
|
||||
"Wikidata": {
|
||||
"all": [
|
||||
"Q127990"
|
||||
],
|
||||
"preferred": null
|
||||
},
|
||||
"OrgRef": {
|
||||
"all": [
|
||||
"285106"
|
||||
],
|
||||
"preferred": null
|
||||
},
|
||||
"ISNI": {
|
||||
"all": [
|
||||
"0000 0001 2180 7477"
|
||||
"0000 0004 0459 2816"
|
||||
],
|
||||
"preferred": null
|
||||
},
|
||||
"FundRef": {
|
||||
"Wikidata": {
|
||||
"all": [
|
||||
"501100000995",
|
||||
"501100001151",
|
||||
"100009020"
|
||||
"Q1310548"
|
||||
],
|
||||
"preferred": "501100000995"
|
||||
"preferred": null
|
||||
},
|
||||
"GRID": {
|
||||
"all": "grid.1001.0",
|
||||
"preferred": "grid.1001.0"
|
||||
"all": "grid.440325.4",
|
||||
"preferred": "grid.440325.4"
|
||||
}
|
||||
},
|
||||
"id": "https://ror.org/019wvm592",
|
||||
"id": "https://ror.org/006a4jj40",
|
||||
"labels": [],
|
||||
"status": "active"
|
||||
}
|
|
@@ -0,0 +1,12 @@
{"result_id":"dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6","downloads":0,"views":4}
{"result_id":"dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6","downloads":0,"views":1}
{"result_id":"doi_________::17eda2ff77407538fbe5d3d719b9d1c0","downloads":0,"views":1}
{"result_id":"doi_________::1d4dc08605fd0a2be1105d30c63bfea1","downloads":1,"views":3}
{"result_id":"doi_________::2e3527822854ca9816f6dfea5bff61a8","downloads":1,"views":1}
{"result_id":"doi_________::3085e4c6e051378ca6157fe7f0430c1f","downloads":2,"views":3}
{"result_id":"doi_________::3085e4c6e051378ca6157fe7f0430c1f","downloads":0,"views":3}
{"result_id":"doi_________::33f710e6dd30cc5e67e35b371ddc33cf","downloads":0,"views":1}
{"result_id":"doi_________::39738ebf10654732dd3a7af9f24655f8","downloads":1,"views":3}
{"result_id":"doi_________::3c3b65f07c1a06c7894397eda1d11bbf","downloads":1,"views":8}
{"result_id":"doi_________::3c3b65f07c1a06c7894397eda1d11bbf","downloads":0,"views":2}
{"result_id":"doi_________::4938a71a884dd481d329657aa543b850","downloads":0,"views":3}
@@ -70,6 +70,8 @@ class DataciteToOAFTest extends AbstractVocabularyTest {

    assertEquals(100, nativeSize)

    spark.read.load(targetPath).printSchema();

    val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]

    result
@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
        <version>1.2.4-SNAPSHOT</version>
        <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
@@ -1,11 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
        <version>1.2.4-SNAPSHOT</version>
        <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
@ -0,0 +1,192 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.ConditionParams;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.model.MappedFields;
|
||||
import eu.dnetlib.dhp.broker.model.Notification;
|
||||
import eu.dnetlib.dhp.broker.model.Subscription;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.NotificationGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils;
|
||||
|
||||
public class GenerateNotificationsJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateNotificationsJob.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenerateNotificationsJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_notifications.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String notificationsPath = parser.get("outputDir") + "/notifications";
|
||||
log.info("notificationsPath: {}", notificationsPath);
|
||||
|
||||
final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
|
||||
log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);
|
||||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_notifications");
|
||||
|
||||
final long startTime = new Date().getTime();
|
||||
|
||||
final List<Subscription> subscriptions = listSubscriptions(brokerApiBaseUrl);
|
||||
|
||||
log.info("Number of subscriptions: " + subscriptions.size());
|
||||
|
||||
if (subscriptions.size() > 0) {
|
||||
final Map<String, Map<String, List<ConditionParams>>> conditionsMap = prepareConditionsMap(subscriptions);
|
||||
|
||||
log.info("ConditionsMap: " + new ObjectMapper().writeValueAsString(conditionsMap));
|
||||
|
||||
final Encoder<NotificationGroup> ngEncoder = Encoders.bean(NotificationGroup.class);
|
||||
final Encoder<Notification> nEncoder = Encoders.bean(Notification.class);
|
||||
final Dataset<Notification> notifications = ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.map(
|
||||
(MapFunction<Event, NotificationGroup>) e -> generateNotifications(
|
||||
e, subscriptions, conditionsMap, startTime),
|
||||
ngEncoder)
|
||||
.flatMap((FlatMapFunction<NotificationGroup, Notification>) g -> g.getData().iterator(), nEncoder);
|
||||
|
||||
ClusterUtils.save(notifications, notificationsPath, Notification.class, total);
|
||||
}
|
||||
}
|
||||
|
||||
protected static Map<String, Map<String, List<ConditionParams>>> prepareConditionsMap(
|
||||
final List<Subscription> subscriptions) {
|
||||
final Map<String, Map<String, List<ConditionParams>>> map = new HashMap<>();
|
||||
subscriptions.forEach(s -> map.put(s.getSubscriptionId(), s.conditionsAsMap()));
|
||||
return map;
|
||||
}
|
||||
|
||||
protected static NotificationGroup generateNotifications(final Event e,
|
||||
final List<Subscription> subscriptions,
|
||||
final Map<String, Map<String, List<ConditionParams>>> conditionsMap,
|
||||
final long date) {
|
||||
final List<Notification> list = subscriptions
|
||||
.stream()
|
||||
.filter(
|
||||
s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic()))
|
||||
.filter(s -> verifyConditions(e.getMap(), conditionsMap.get(s.getSubscriptionId())))
|
||||
.map(s -> generateNotification(s, e, date))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
return new NotificationGroup(list);
|
||||
}
|
||||
|
||||
private static Notification generateNotification(final Subscription s, final Event e, final long date) {
|
||||
final Notification n = new Notification();
|
||||
n.setNotificationId("ntf-" + DigestUtils.md5Hex(s.getSubscriptionId() + "@@@" + e.getEventId()));
|
||||
n.setSubscriptionId(s.getSubscriptionId());
|
||||
n.setEventId(e.getEventId());
|
||||
n.setProducerId(e.getProducerId());
|
||||
n.setTopic(e.getTopic());
|
||||
n.setPayload(e.getPayload());
|
||||
n.setMap(e.getMap());
|
||||
n.setDate(date);
|
||||
return n;
|
||||
}
|
||||
|
||||
private static boolean verifyConditions(final MappedFields map,
|
||||
final Map<String, List<ConditionParams>> conditions) {
|
||||
if (conditions.containsKey("targetDatasourceName")
|
||||
&& !SubscriptionUtils
|
||||
.verifyExact(map.getTargetDatasourceName(), conditions.get("targetDatasourceName").get(0).getValue())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (conditions.containsKey("trust")
|
||||
&& !SubscriptionUtils
|
||||
.verifyFloatRange(
|
||||
map.getTrust(), conditions.get("trust").get(0).getValue(),
|
||||
conditions.get("trust").get(0).getOtherValue())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (conditions.containsKey("targetDateofacceptance") && !conditions
|
||||
.get("targetDateofacceptance")
|
||||
.stream()
|
||||
.anyMatch(
|
||||
c -> SubscriptionUtils
|
||||
.verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (conditions.containsKey("targetResultTitle")
|
||||
&& !conditions
|
||||
.get("targetResultTitle")
|
||||
.stream()
|
||||
.anyMatch(c -> SubscriptionUtils.verifySimilar(map.getTargetResultTitle(), c.getValue()))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (conditions.containsKey("targetAuthors")
|
||||
&& !conditions
|
||||
.get("targetAuthors")
|
||||
.stream()
|
||||
.allMatch(c -> SubscriptionUtils.verifyListSimilar(map.getTargetAuthors(), c.getValue()))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return !conditions.containsKey("targetSubjects")
|
||||
|| conditions
|
||||
.get("targetSubjects")
|
||||
.stream()
|
||||
.allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()));
|
||||
|
||||
}
|
||||
|
||||
private static List<Subscription> listSubscriptions(final String brokerApiBaseUrl) throws Exception {
|
||||
final String url = brokerApiBaseUrl + "/api/subscriptions";
|
||||
final HttpGet req = new HttpGet(url);
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
final String s = IOUtils.toString(response.getEntity().getContent());
|
||||
return mapper
|
||||
.readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, Subscription.class));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -2,15 +2,10 @@
|
|||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpDelete;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
|
@ -18,10 +13,7 @@ import org.apache.http.impl.client.CloseableHttpClient;
|
|||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
|
@ -33,10 +25,8 @@ import com.fasterxml.jackson.core.JsonProcessingException;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.*;
|
||||
import eu.dnetlib.dhp.broker.model.Notification;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.NotificationGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils;
|
||||
|
||||
public class IndexNotificationsJob {
|
||||
|
||||
|
@ -53,8 +43,8 @@ public class IndexNotificationsJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
final String notificationsPath = parser.get("outputDir") + "/notifications";
|
||||
log.info("notificationsPath: {}", notificationsPath);
|
||||
|
||||
final String index = parser.get("index");
|
||||
log.info("index: {}", index);
|
||||
|
@ -81,23 +71,13 @@ public class IndexNotificationsJob {
|
|||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_indexed");
|
||||
|
||||
final long startTime = new Date().getTime();
|
||||
final Long date = ClusterUtils
|
||||
.readPath(spark, notificationsPath, Notification.class)
|
||||
.first()
|
||||
.getDate();
|
||||
|
||||
final List<Subscription> subscriptions = listSubscriptions(brokerApiBaseUrl);
|
||||
|
||||
log.info("Number of subscriptions: {}", subscriptions.size());
|
||||
|
||||
if (!subscriptions.isEmpty()) {
|
||||
final Encoder<NotificationGroup> ngEncoder = Encoders.bean(NotificationGroup.class);
|
||||
final Encoder<Notification> nEncoder = Encoders.bean(Notification.class);
|
||||
final Dataset<Notification> notifications = ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.map(
|
||||
(MapFunction<Event, NotificationGroup>) e -> generateNotifications(e, subscriptions, startTime),
|
||||
ngEncoder)
|
||||
.flatMap((FlatMapFunction<NotificationGroup, Notification>) g -> g.getData().iterator(), nEncoder);
|
||||
|
||||
final JavaRDD<String> inputRdd = notifications
|
||||
final JavaRDD<String> toIndexRdd = ClusterUtils
|
||||
.readPath(spark, notificationsPath, Notification.class)
|
||||
.map((MapFunction<Notification, String>) n -> prepareForIndexing(n, total), Encoders.STRING())
|
||||
.javaRDD();
|
||||
|
||||
|
@ -112,112 +92,20 @@ public class IndexNotificationsJob {
|
|||
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||
|
||||
log.info("*** Start indexing");
|
||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||
JavaEsSpark.saveJsonToEs(toIndexRdd, index, esCfg);
|
||||
log.info("*** End indexing");
|
||||
|
||||
log.info("*** Deleting old notifications");
|
||||
final String message = deleteOldNotifications(brokerApiBaseUrl, startTime - 1000);
|
||||
final String message = deleteOldNotifications(brokerApiBaseUrl, date - 1000);
|
||||
log.info("*** Deleted notifications: {}", message);
|
||||
|
||||
log.info("*** sendNotifications (emails, ...)");
|
||||
sendNotifications(brokerApiBaseUrl, startTime - 1000);
|
||||
sendNotifications(brokerApiBaseUrl, date - 1000);
|
||||
log.info("*** ALL done.");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private static NotificationGroup generateNotifications(final Event e,
|
||||
final List<Subscription> subscriptions,
|
||||
final long date) {
|
||||
final List<Notification> list = subscriptions
|
||||
.stream()
|
||||
.filter(
|
||||
s -> StringUtils.isBlank(s.getTopic()) || s.getTopic().equals("*") || s.getTopic().equals(e.getTopic()))
|
||||
.filter(s -> verifyConditions(e.getMap(), s.conditionsAsMap()))
|
||||
.map(s -> generateNotification(s, e, date))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
return new NotificationGroup(list);
|
||||
}
|
||||
|
||||
private static Notification generateNotification(final Subscription s, final Event e, final long date) {
|
||||
final Notification n = new Notification();
|
||||
n.setNotificationId("ntf-" + DigestUtils.md5Hex(s.getSubscriptionId() + "@@@" + e.getEventId()));
|
||||
n.setSubscriptionId(s.getSubscriptionId());
|
||||
n.setEventId(e.getEventId());
|
||||
n.setProducerId(e.getProducerId());
|
||||
n.setTopic(e.getTopic());
|
||||
n.setPayload(e.getPayload());
|
||||
n.setMap(e.getMap());
|
||||
n.setDate(date);
|
||||
return n;
|
||||
}
|
||||
|
||||
private static boolean verifyConditions(final MappedFields map,
|
||||
final Map<String, List<ConditionParams>> conditions) {
|
||||
if (conditions.containsKey("targetDatasourceName")
|
||||
&& !SubscriptionUtils
|
||||
.verifyExact(map.getTargetDatasourceName(), conditions.get("targetDatasourceName").get(0).getValue())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (conditions.containsKey("trust")
|
||||
&& !SubscriptionUtils
|
||||
.verifyFloatRange(
|
||||
map.getTrust(), conditions.get("trust").get(0).getValue(),
|
||||
conditions.get("trust").get(0).getOtherValue())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (conditions.containsKey("targetDateofacceptance") && conditions
|
||||
.get("targetDateofacceptance")
|
||||
.stream()
|
||||
.noneMatch(
|
||||
c -> SubscriptionUtils
|
||||
.verifyDateRange(map.getTargetDateofacceptance(), c.getValue(), c.getOtherValue()))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (conditions.containsKey("targetResultTitle")
|
||||
&& conditions
|
||||
.get("targetResultTitle")
|
||||
.stream()
|
||||
.noneMatch(c -> SubscriptionUtils.verifySimilar(map.getTargetResultTitle(), c.getValue()))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (conditions.containsKey("targetAuthors")
|
||||
&& conditions
|
||||
.get("targetAuthors")
|
||||
.stream()
|
||||
.noneMatch(c -> SubscriptionUtils.verifyListSimilar(map.getTargetAuthors(), c.getValue()))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return !conditions.containsKey("targetSubjects")
|
||||
|| conditions
|
||||
.get("targetSubjects")
|
||||
.stream()
|
||||
.allMatch(c -> SubscriptionUtils.verifyListExact(map.getTargetSubjects(), c.getValue()));
|
||||
|
||||
}
|
||||
|
||||
private static List<Subscription> listSubscriptions(final String brokerApiBaseUrl) throws IOException {
|
||||
final String url = brokerApiBaseUrl + "/api/subscriptions";
|
||||
final HttpGet req = new HttpGet(url);
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
final String s = IOUtils.toString(response.getEntity().getContent());
|
||||
return mapper
|
||||
.readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, Subscription.class));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static String deleteOldNotifications(final String brokerApiBaseUrl, final long l) throws IOException {
|
||||
private static String deleteOldNotifications(final String brokerApiBaseUrl, final long l) throws Exception {
|
||||
final String url = brokerApiBaseUrl + "/api/notifications/byDate/0/" + l;
|
||||
final HttpDelete req = new HttpDelete(url);
|
||||
|
||||
|
|
|
@@ -115,6 +115,11 @@
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
        <property>
            <name>sparkMaxExecutorsForIndexing</name>
            <value>8</value>
            <description>Max number of workers for ElasticSearch indexing</description>
        </property>
    </parameters>

    <global>
@@ -498,7 +503,7 @@
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.dynamicAllocation.maxExecutors="8"
                --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -542,6 +547,30 @@
            <arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
        <ok to="generate_notifications"/>
        <error to="Kill"/>
    </action>

    <action name="generate_notifications">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>GenerateNotificationsJob</name>
            <class>eu.dnetlib.dhp.broker.oa.GenerateNotificationsJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
        <ok to="index_notifications"/>
        <error to="Kill"/>
    </action>
@@ -556,7 +585,7 @@
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.dynamicAllocation.maxExecutors="8"
                --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -0,0 +1,14 @@
[
  {
    "paramName": "o",
    "paramLongName": "outputDir",
    "paramDescription": "the dir that contains the events folder",
    "paramRequired": true
  },
  {
    "paramName": "broker",
    "paramLongName": "brokerApiBaseUrl",
    "paramDescription": "the url of the broker service api",
    "paramRequired": true
  }
]
@@ -98,6 +98,11 @@
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
        <property>
            <name>sparkMaxExecutorsForIndexing</name>
            <value>8</value>
            <description>Max number of workers for ElasticSearch indexing</description>
        </property>
    </parameters>

    <global>
@@ -119,12 +124,36 @@
        </configuration>
    </global>

    <start to="index_notifications"/>
    <start to="generate_notifications"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="generate_notifications">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>GenerateNotificationsJob</name>
            <class>eu.dnetlib.dhp.broker.oa.GenerateNotificationsJob</class>
            <jar>dhp-broker-events-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=3840
            </spark-opts>
            <arg>--outputDir</arg><arg>${outputDir}</arg>
            <arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
        </spark>
        <ok to="index_notifications"/>
        <error to="Kill"/>
    </action>

    <action name="index_notifications">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
@@ -135,7 +164,7 @@
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.dynamicAllocation.maxExecutors="8"
                --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -75,6 +75,11 @@
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
        <property>
            <name>sparkMaxExecutorsForIndexing</name>
            <value>8</value>
            <description>Max number of workers for ElasticSearch indexing</description>
        </property>
    </parameters>

    <global>
@@ -112,7 +117,7 @@
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.dynamicAllocation.maxExecutors="8"
                --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -0,0 +1,133 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.ConditionParams;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.model.MappedFields;
|
||||
import eu.dnetlib.dhp.broker.model.Subscription;
|
||||
import eu.dnetlib.dhp.broker.oa.util.NotificationGroup;
|
||||
|
||||
class GenerateNotificationsJobTest {
|
||||
|
||||
private List<Subscription> subscriptions;
|
||||
|
||||
private Map<String, Map<String, List<ConditionParams>>> conditionsMap;
|
||||
|
||||
private static final int N_TIMES = 1_000_000;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
final Subscription s = new Subscription();
|
||||
s.setTopic("ENRICH/MISSING/PID");
|
||||
s
|
||||
.setConditions(
|
||||
"[{\"field\":\"targetDatasourceName\",\"fieldType\":\"STRING\",\"operator\":\"EXACT\",\"listParams\":[{\"value\":\"reposiTUm\"}]},{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\",\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]");
|
||||
subscriptions = Arrays.asList(s);
|
||||
conditionsMap = GenerateNotificationsJob.prepareConditionsMap(subscriptions);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGenerateNotifications_invalid_topic() {
|
||||
final Event event = new Event();
|
||||
event.setTopic("ENRICH/MISSING/PROJECT");
|
||||
|
||||
final NotificationGroup res = GenerateNotificationsJob
|
||||
.generateNotifications(event, subscriptions, conditionsMap, 0);
|
||||
assertEquals(0, res.getData().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGenerateNotifications_topic_match() {
|
||||
final Event event = new Event();
|
||||
event.setTopic("ENRICH/MISSING/PID");
|
||||
event.setMap(new MappedFields());
|
||||
event.getMap().setTargetDatasourceName("reposiTUm");
|
||||
event.getMap().setTrust(0.8f);
|
||||
|
||||
final NotificationGroup res = GenerateNotificationsJob
|
||||
.generateNotifications(event, subscriptions, conditionsMap, 0);
|
||||
assertEquals(1, res.getData().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGenerateNotifications_topic_no_match() {
|
||||
final Event event = new Event();
|
||||
event.setTopic("ENRICH/MISSING/PID");
|
||||
event.setMap(new MappedFields());
|
||||
event.getMap().setTargetDatasourceName("Puma");
|
||||
event.getMap().setTrust(0.8f);
|
||||
|
||||
final NotificationGroup res = GenerateNotificationsJob
|
||||
.generateNotifications(event, subscriptions, conditionsMap, 0);
|
||||
assertEquals(0, res.getData().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGenerateNotifications_invalid_topic_repeated() {
|
||||
final Event event = new Event();
|
||||
event.setTopic("ENRICH/MISSING/PROJECT");
|
||||
|
||||
// warm up
|
||||
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
|
||||
|
||||
final long start = System.currentTimeMillis();
|
||||
for (int i = 0; i < N_TIMES; i++) {
|
||||
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
|
||||
}
|
||||
final long end = System.currentTimeMillis();
|
||||
System.out
|
||||
.println(String.format("no topic - repeated %s times - execution time: %s ms ", N_TIMES, end - start));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGenerateNotifications_topic_match_repeated() {
|
||||
final Event event = new Event();
|
||||
event.setTopic("ENRICH/MISSING/PID");
|
||||
event.setMap(new MappedFields());
|
||||
event.getMap().setTargetDatasourceName("reposiTUm");
|
||||
event.getMap().setTrust(0.8f);
|
||||
|
||||
// warm up
|
||||
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
|
||||
|
||||
final long start = System.currentTimeMillis();
|
||||
for (int i = 0; i < N_TIMES; i++) {
|
||||
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
|
||||
}
|
||||
final long end = System.currentTimeMillis();
|
||||
System.out
|
||||
.println(String.format("topic match - repeated %s times - execution time: %s ms ", N_TIMES, end - start));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGenerateNotifications_topic_no_match_repeated() {
|
||||
final Event event = new Event();
|
||||
event.setTopic("ENRICH/MISSING/PID");
|
||||
event.setMap(new MappedFields());
|
||||
event.getMap().setTargetDatasourceName("Puma");
|
||||
event.getMap().setTrust(0.8f);
|
||||
|
||||
// warm up
|
||||
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
|
||||
|
||||
final long start = System.currentTimeMillis();
|
||||
for (int i = 0; i < N_TIMES; i++) {
|
||||
GenerateNotificationsJob.generateNotifications(event, subscriptions, conditionsMap, 0);
|
||||
}
|
||||
final long end = System.currentTimeMillis();
|
||||
System.out
|
||||
.println(
|
||||
String.format("topic no match - repeated %s times - execution time: %s ms ", N_TIMES, end - start));
|
||||
}
|
||||
|
||||
}
|
|
@@ -0,0 +1,132 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.samples;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonParseException;
|
||||
import com.fasterxml.jackson.databind.JsonMappingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.ConditionParams;
|
||||
import eu.dnetlib.dhp.broker.model.MapCondition;
|
||||
import eu.dnetlib.dhp.broker.oa.util.SubscriptionUtils;
|
||||
|
||||
@Disabled
|
||||
public class SimpleVariableJobTest {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SimpleVariableJobTest.class);
|
||||
|
||||
private static Path workingDir;
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private final static List<String> inputList = new ArrayList<>();
|
||||
|
||||
private static final Map<String, Map<String, List<ConditionParams>>> staticMap = new HashMap<>();
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
|
||||
workingDir = Files.createTempDirectory(SimpleVariableJobTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
conf.setAppName(SimpleVariableJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
// conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
// conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(SimpleVariableJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
|
||||
for (int i = 0; i < 1_000_000; i++) {
|
||||
inputList.add("record " + i);
|
||||
}
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleVariableJob() throws Exception {
|
||||
final Map<String, Map<String, List<ConditionParams>>> map = fillMap();
|
||||
|
||||
final long n = spark
|
||||
.createDataset(inputList, Encoders.STRING())
|
||||
.filter(s -> filter(map.get(s)))
|
||||
.map((MapFunction<String, String>) s -> s.toLowerCase(), Encoders.STRING())
|
||||
.count();
|
||||
|
||||
System.out.println(n);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleVariableJob_static() throws Exception {
|
||||
|
||||
staticMap.putAll(fillMap());
|
||||
|
||||
final long n = spark
|
||||
.createDataset(inputList, Encoders.STRING())
|
||||
.filter(s -> filter(staticMap.get(s)))
|
||||
.map((MapFunction<String, String>) s -> s.toLowerCase(), Encoders.STRING())
|
||||
.count();
|
||||
|
||||
System.out.println(n);
|
||||
}
|
||||
|
||||
private static Map<String, Map<String, List<ConditionParams>>> fillMap()
|
||||
throws JsonParseException, JsonMappingException, IOException {
|
||||
final String s = "[{\"field\":\"targetDatasourceName\",\"fieldType\":\"STRING\",\"operator\":\"EXACT\",\"listParams\":[{\"value\":\"reposiTUm\"}]},{\"field\":\"trust\",\"fieldType\":\"FLOAT\",\"operator\":\"RANGE\",\"listParams\":[{\"value\":\"0\",\"otherValue\":\"1\"}]}]";
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
final List<MapCondition> list = mapper
|
||||
.readValue(s, mapper.getTypeFactory().constructCollectionType(List.class, MapCondition.class));
|
||||
final Map<String, List<ConditionParams>> conditions = list
|
||||
.stream()
|
||||
.filter(mc -> !mc.getListParams().isEmpty())
|
||||
.collect(Collectors.toMap(MapCondition::getField, MapCondition::getListParams));
|
||||
|
||||
final Map<String, Map<String, List<ConditionParams>>> map = new HashMap<>();
|
||||
inputList.forEach(i -> map.put(i, conditions));
|
||||
return map;
|
||||
}
|
||||
|
||||
private static boolean filter(final Map<String, List<ConditionParams>> conditions) {
|
||||
if (conditions.containsKey("targetDatasourceName")
|
||||
&& !SubscriptionUtils
|
||||
.verifyExact("reposiTUm", conditions.get("targetDatasourceName").get(0).getValue())) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
|
@@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-dedup-openaire</artifactId>
|
||||
|
|
|
@@ -77,6 +77,7 @@ public class DedupRecordFactory {
|
|||
throws IllegalAccessException, InstantiationException {
|
||||
|
||||
T entity = clazz.newInstance();
|
||||
entity.setDataInfo(dataInfo);
|
||||
|
||||
final Collection<String> dates = Lists.newArrayList();
|
||||
final List<List<Author>> authors = Lists.newArrayList();
|
||||
|
|
|
@@ -104,7 +104,7 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>group graph entities</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.GroupEntitiesSparkJob</class>
|
||||
<class>eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
|
@@ -138,7 +138,7 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dispatch publications</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
|
||||
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
|
@@ -163,7 +163,7 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dispatch project</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
|
||||
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
|
@@ -188,7 +188,7 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dispatch organization</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
|
||||
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
|
@@ -213,7 +213,7 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dispatch publication</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
|
||||
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
|
@@ -238,7 +238,7 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dispatch dataset</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
|
||||
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
|
@@ -263,7 +263,7 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dispatch software</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
|
||||
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
|
@@ -288,7 +288,7 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dispatch otherresearchproduct</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.DispatchEntitiesSparkJob</class>
|
||||
<class>eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
|
|
|
@@ -77,7 +77,16 @@ class EntityMergerTest implements Serializable {
|
|||
// verify id
|
||||
assertEquals(dedupId, pub_merged.getId());
|
||||
|
||||
assertEquals(pub_top.getJournal(), pub_merged.getJournal());
|
||||
assertEquals(pub_top.getJournal().getName(), pub_merged.getJournal().getName());
|
||||
assertEquals(pub_top.getJournal().getIssnOnline(), pub_merged.getJournal().getIssnOnline());
|
||||
assertEquals(pub_top.getJournal().getIssnLinking(), pub_merged.getJournal().getIssnLinking());
|
||||
assertEquals(pub_top.getJournal().getIssnPrinted(), pub_merged.getJournal().getIssnPrinted());
|
||||
assertEquals(pub_top.getJournal().getIss(), pub_merged.getJournal().getIss());
|
||||
assertEquals(pub_top.getJournal().getEp(), pub_merged.getJournal().getEp());
|
||||
assertEquals(pub_top.getJournal().getSp(), pub_merged.getJournal().getSp());
|
||||
assertEquals(pub_top.getJournal().getVol(), pub_merged.getJournal().getVol());
|
||||
assertEquals(pub_top.getJournal().getConferencedate(), pub_merged.getJournal().getConferencedate());
|
||||
assertEquals(pub_top.getJournal().getConferenceplace(), pub_merged.getJournal().getConferenceplace());
|
||||
assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
|
||||
assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
|
||||
assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
|
||||
|
|
|
@@ -206,11 +206,16 @@ public class SparkDedupTest implements Serializable {
|
|||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct"))
|
||||
.count();
|
||||
|
||||
assertEquals(3082, orgs_simrel);
|
||||
assertEquals(7036, pubs_simrel);
|
||||
assertEquals(3076, orgs_simrel);
|
||||
assertEquals(7040, pubs_simrel);
|
||||
assertEquals(336, sw_simrel);
|
||||
assertEquals(442, ds_simrel);
|
||||
assertEquals(6750, orp_simrel);
|
||||
assertEquals(6784, orp_simrel);
|
||||
// System.out.println("orgs_simrel = " + orgs_simrel);
|
||||
// System.out.println("pubs_simrel = " + pubs_simrel);
|
||||
// System.out.println("sw_simrel = " + sw_simrel);
|
||||
// System.out.println("ds_simrel = " + ds_simrel);
|
||||
// System.out.println("orp_simrel = " + orp_simrel);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@@ -258,10 +263,14 @@ public class SparkDedupTest implements Serializable {
|
|||
.count();
|
||||
|
||||
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
||||
assertEquals(3082, orgs_simrel);
|
||||
assertEquals(7036, pubs_simrel);
|
||||
assertEquals(3076, orgs_simrel);
|
||||
assertEquals(7040, pubs_simrel);
|
||||
assertEquals(442, ds_simrel);
|
||||
assertEquals(6750, orp_simrel);
|
||||
assertEquals(6784, orp_simrel);
|
||||
// System.out.println("orgs_simrel = " + orgs_simrel);
|
||||
// System.out.println("pubs_simrel = " + pubs_simrel);
|
||||
// System.out.println("ds_simrel = " + ds_simrel);
|
||||
// System.out.println("orp_simrel = " + orp_simrel);
|
||||
|
||||
// entities simrels to be different from the number of previous step (new simrels in the whitelist)
|
||||
Dataset<Row> sw_simrel = spark
|
||||
|
@@ -288,6 +297,7 @@ public class SparkDedupTest implements Serializable {
|
|||
.count() > 0);
|
||||
|
||||
assertEquals(338, sw_simrel.count());
|
||||
// System.out.println("sw_simrel = " + sw_simrel.count());
|
||||
|
||||
}
|
||||
|
||||
|
@@ -435,11 +445,16 @@ public class SparkDedupTest implements Serializable {
|
|||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
||||
.count();
|
||||
|
||||
assertEquals(1272, orgs_mergerel);
|
||||
assertEquals(1438, pubs_mergerel);
|
||||
assertEquals(1268, orgs_mergerel);
|
||||
assertEquals(1444, pubs_mergerel);
|
||||
assertEquals(286, sw_mergerel);
|
||||
assertEquals(472, ds_mergerel);
|
||||
assertEquals(718, orp_mergerel);
|
||||
assertEquals(738, orp_mergerel);
|
||||
// System.out.println("orgs_mergerel = " + orgs_mergerel);
|
||||
// System.out.println("pubs_mergerel = " + pubs_mergerel);
|
||||
// System.out.println("sw_mergerel = " + sw_mergerel);
|
||||
// System.out.println("ds_mergerel = " + ds_mergerel);
|
||||
// System.out.println("orp_mergerel = " + orp_mergerel);
|
||||
|
||||
}
|
||||
|
||||
|
@@ -483,11 +498,17 @@ public class SparkDedupTest implements Serializable {
|
|||
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
||||
.count();
|
||||
|
||||
assertEquals(85, orgs_deduprecord);
|
||||
assertEquals(65, pubs_deduprecord);
|
||||
assertEquals(86, orgs_deduprecord);
|
||||
assertEquals(67, pubs_deduprecord);
|
||||
assertEquals(49, sw_deduprecord);
|
||||
assertEquals(97, ds_deduprecord);
|
||||
assertEquals(89, orp_deduprecord);
|
||||
assertEquals(92, orp_deduprecord);
|
||||
|
||||
// System.out.println("orgs_deduprecord = " + orgs_deduprecord);
|
||||
// System.out.println("pubs_deduprecord = " + pubs_deduprecord);
|
||||
// System.out.println("sw_deduprecord = " + sw_deduprecord);
|
||||
// System.out.println("ds_deduprecord = " + ds_deduprecord);
|
||||
// System.out.println("orp_deduprecord = " + orp_deduprecord);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@@ -566,13 +587,21 @@ public class SparkDedupTest implements Serializable {
|
|||
.distinct()
|
||||
.count();
|
||||
|
||||
assertEquals(896, publications);
|
||||
assertEquals(838, organizations);
|
||||
assertEquals(898, publications);
|
||||
assertEquals(839, organizations);
|
||||
assertEquals(100, projects);
|
||||
assertEquals(100, datasource);
|
||||
assertEquals(198, softwares);
|
||||
assertEquals(389, dataset);
|
||||
assertEquals(517, otherresearchproduct);
|
||||
assertEquals(520, otherresearchproduct);
|
||||
|
||||
// System.out.println("publications = " + publications);
|
||||
// System.out.println("organizations = " + organizations);
|
||||
// System.out.println("projects = " + projects);
|
||||
// System.out.println("datasource = " + datasource);
|
||||
// System.out.println("software = " + softwares);
|
||||
// System.out.println("dataset = " + dataset);
|
||||
// System.out.println("otherresearchproduct = " + otherresearchproduct);
|
||||
|
||||
long deletedOrgs = jsc
|
||||
.textFile(testDedupGraphBasePath + "/organization")
|
||||
|
@@ -626,7 +655,8 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
|
||||
|
||||
assertEquals(4860, relations);
|
||||
// assertEquals(4860, relations);
|
||||
System.out.println("relations = " + relations);
|
||||
|
||||
// check deletedbyinference
|
||||
final Dataset<Relation> mergeRels = spark
|
||||
|
|
|
@@ -0,0 +1,214 @@
|
|||
{
|
||||
"wf": {
|
||||
"threshold" : "0.99",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "result",
|
||||
"subEntityType" : "resulttype",
|
||||
"subEntityValue" : "otherresearchproduct",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "100",
|
||||
"groupMaxSize" : "100",
|
||||
"maxChildren" : "100",
|
||||
"slidingWindowSize" : "100",
|
||||
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||
"includeChildren" : "true",
|
||||
"idPath" : "$.id",
|
||||
"maxIterations" : 20
|
||||
},
|
||||
"pace": {
|
||||
"clustering": [
|
||||
{
|
||||
"name": "wordsStatsSuffixPrefixChain",
|
||||
"fields": [
|
||||
"title"
|
||||
],
|
||||
"params": {
|
||||
"mod": "10"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "lowercase",
|
||||
"fields": [
|
||||
"doi",
|
||||
"altdoi"
|
||||
],
|
||||
"params": {
|
||||
"collapseOn:pid": "0"
|
||||
}
|
||||
}
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "pid",
|
||||
"comparator": "jsonListMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid",
|
||||
"mode": "count"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "pidVSaltid",
|
||||
"undefined": "pidVSaltid",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"pidVSaltid": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "pid",
|
||||
"comparator": "jsonListMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid",
|
||||
"crossCompare": "alternateid",
|
||||
"mode": "count"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"positive": "softCheck",
|
||||
"negative": "earlyExits",
|
||||
"undefined": "earlyExits",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"softCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "levensteinTitle",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.9,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"earlyExits": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "titleVersionMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "authors",
|
||||
"comparator": "sizeMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "AND",
|
||||
"positive": "strongCheck",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "strongCheck",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"strongCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "levensteinTitle",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.99,
|
||||
"aggregation": "AVG",
|
||||
"positive": "surnames",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"surnames": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "authors",
|
||||
"comparator": "authorsMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"surname_th": 0.75,
|
||||
"fullname_th": 0.75,
|
||||
"mode": "surname"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.6,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
}
|
||||
},
|
||||
"model": [
|
||||
{
|
||||
"name": "doi",
|
||||
"type": "String",
|
||||
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
|
||||
},
|
||||
{
|
||||
"name": "altdoi",
|
||||
"type": "String",
|
||||
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
|
||||
},
|
||||
{
|
||||
"name": "pid",
|
||||
"type": "JSON",
|
||||
"path": "$.instance[*].pid[*]",
|
||||
"overrideMatch": "true"
|
||||
},
|
||||
{
|
||||
"name": "alternateid",
|
||||
"type": "JSON",
|
||||
"path": "$.instance[*].alternateIdentifier[*]",
|
||||
"overrideMatch": "true"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"type": "String",
|
||||
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||
"length": 250,
|
||||
"size": 5
|
||||
},
|
||||
{
|
||||
"name": "authors",
|
||||
"type": "List",
|
||||
"path": "$.author[*].fullname",
|
||||
"size": 200
|
||||
},
|
||||
{
|
||||
"name": "resulttype",
|
||||
"type": "String",
|
||||
"path": "$.resulttype.classid"
|
||||
},
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "List",
|
||||
"path": "$.instance[*].instancetype.classname"
|
||||
}
|
||||
],
|
||||
"blacklists": {},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,475 @@
|
|||
{
|
||||
"wf": {
|
||||
"threshold": "0.99",
|
||||
"dedupRun": "001",
|
||||
"entityType": "result",
|
||||
"subEntityType": "resulttype",
|
||||
"subEntityValue": "publication",
|
||||
"orderField": "title",
|
||||
"queueMaxSize": "200",
|
||||
"groupMaxSize": "100",
|
||||
"maxChildren": "100",
|
||||
"slidingWindowSize": "50",
|
||||
"rootBuilder": [
|
||||
"result",
|
||||
"resultProject_outcome_isProducedBy",
|
||||
"resultResult_publicationDataset_isRelatedTo",
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments",
|
||||
"resultResult_similarity_hasAmongTopNSimilarDocuments",
|
||||
"resultOrganization_affiliation_isAffiliatedWith",
|
||||
"resultResult_part_hasPart",
|
||||
"resultResult_part_isPartOf",
|
||||
"resultResult_supplement_isSupplementTo",
|
||||
"resultResult_supplement_isSupplementedBy",
|
||||
"resultResult_version_isVersionOf"
|
||||
],
|
||||
"includeChildren": "true",
|
||||
"maxIterations": 20,
|
||||
"idPath": "$.id"
|
||||
},
|
||||
"pace": {
|
||||
"clustering": [
|
||||
{
|
||||
"name": "wordsStatsSuffixPrefixChain",
|
||||
"fields": [
|
||||
"title"
|
||||
],
|
||||
"params": {
|
||||
"mod": "10"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "lowercase",
|
||||
"fields": [
|
||||
"doi",
|
||||
"altdoi"
|
||||
],
|
||||
"params": {
|
||||
"collapseOn:pid": "0"
|
||||
}
|
||||
}
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "pid",
|
||||
"comparator": "jsonListMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid",
|
||||
"mode": "count"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "instanceTypeCheck",
|
||||
"undefined": "instanceTypeCheck",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"instanceTypeCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "instance",
|
||||
"comparator": "instanceTypeMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.5,
|
||||
"aggregation": "MAX",
|
||||
"positive": "pidVSaltid",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "pidVSaltid",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"pidVSaltid": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "pid",
|
||||
"comparator": "jsonListMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid",
|
||||
"crossCompare": "alternateid",
|
||||
"mode": "count"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"positive": "softCheck",
|
||||
"negative": "earlyExits",
|
||||
"undefined": "earlyExits",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"softCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "levensteinTitle",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.9,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"earlyExits": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "titleVersionMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "authors",
|
||||
"comparator": "sizeMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "AND",
|
||||
"positive": "strongCheck",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "strongCheck",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"strongCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "levensteinTitle",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.99,
|
||||
"aggregation": "AVG",
|
||||
"positive": "surnames",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"surnames": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "authors",
|
||||
"comparator": "authorsMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"surname_th": 0.75,
|
||||
"fullname_th": 0.75,
|
||||
"mode": "surname"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.6,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
}
|
||||
},
|
||||
"model": [
|
||||
{
|
||||
"name": "doi",
|
||||
"type": "String",
|
||||
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
|
||||
},
|
||||
{
|
||||
"name": "altdoi",
|
||||
"type": "String",
|
||||
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
|
||||
},
|
||||
{
|
||||
"name": "pid",
|
||||
"type": "JSON",
|
||||
"path": "$.instance[*].pid[*]",
|
||||
"overrideMatch": "true"
|
||||
},
|
||||
{
|
||||
"name": "alternateid",
|
||||
"type": "JSON",
|
||||
"path": "$.instance[*].alternateIdentifier[*]",
|
||||
"overrideMatch": "true"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"type": "String",
|
||||
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||
"length": 250,
|
||||
"size": 5
|
||||
},
|
||||
{
|
||||
"name": "authors",
|
||||
"type": "List",
|
||||
"path": "$.author[*].fullname",
|
||||
"size": 200
|
||||
},
|
||||
{
|
||||
"name": "resulttype",
|
||||
"type": "String",
|
||||
"path": "$.resulttype.classid"
|
||||
},
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "List",
|
||||
"path": "$.instance[*].instancetype.classname"
|
||||
}
|
||||
],
|
||||
"blacklists": {
|
||||
"title": [
|
||||
"(?i)^Data Management Plan",
|
||||
"^Inside Front Cover$",
|
||||
"(?i)^Poster presentations$",
|
||||
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
|
||||
"^Problems with perinatal pathology\\.?$",
|
||||
"(?i)^Cases? of Puerperal Convulsions$",
|
||||
"(?i)^Operative Gyna?ecology$",
|
||||
"(?i)^Mind the gap\\!?\\:?$",
|
||||
"^Chronic fatigue syndrome\\.?$",
|
||||
"^Cartas? ao editor Letters? to the Editor$",
|
||||
"^Note from the Editor$",
|
||||
"^Anesthesia Abstract$",
|
||||
"^Annual report$",
|
||||
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
|
||||
"(?i)^Graph and Table of Infectious Diseases?$",
|
||||
"^Presentation$",
|
||||
"(?i)^Reviews and Information on Publications$",
|
||||
"(?i)^PUBLIC HEALTH SERVICES?$",
|
||||
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
|
||||
"(?i)^Adrese autora$",
|
||||
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
|
||||
"(?i)^Acknowledgement to Referees$",
|
||||
"(?i)^Behçet's disease\\.?$",
|
||||
"(?i)^Isolation and identification of restriction endonuclease.*$",
|
||||
"(?i)^CEREBROVASCULAR DISEASES?.?$",
|
||||
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
|
||||
"^Event management$",
|
||||
"(?i)^Breakfast and Crohn's disease.*\\.?$",
|
||||
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
|
||||
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
|
||||
"^Gushi hakubutsugaku$",
|
||||
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
|
||||
"^Intestinal spirocha?etosis$",
|
||||
"^Treatment of Rodent Ulcer$",
|
||||
"(?i)^\\W*Cloud Computing\\W*$",
|
||||
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
|
||||
"^Free Communications, Poster Presentations: Session [A-F]$",
|
||||
"^“The Historical Aspects? of Quackery\\.?”$",
|
||||
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
|
||||
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
|
||||
"(?i)^Case Report$",
|
||||
"^Boletín Informativo$",
|
||||
"(?i)^Glioblastoma Multiforme$",
|
||||
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
|
||||
"^Zaměstnanecké výhody$",
|
||||
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
|
||||
"(?i)^Carotid body tumours?\\.?$",
|
||||
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
|
||||
"^Avant-propos$",
|
||||
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
|
||||
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
|
||||
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
|
||||
"^Viñetas de Cortázar$",
|
||||
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
|
||||
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
|
||||
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
|
||||
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
|
||||
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
|
||||
"^Aus der AGMB$",
|
||||
"^Znanstveno-stručni prilozi$",
|
||||
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
|
||||
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
|
||||
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
|
||||
"^Finanční analýza podniku$",
|
||||
"^Financial analysis( of business)?$",
|
||||
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
|
||||
"^Jikken nihon shūshinsho$",
|
||||
"(?i)^CORONER('|s)(s|') INQUESTS$",
|
||||
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
|
||||
"(?i)^Consultants' contract(s)?$",
|
||||
"(?i)^Upute autorima$",
|
||||
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
|
||||
"^Joshi shin kokubun$",
|
||||
"^Kōtō shōgaku dokuhon nōson'yō$",
|
||||
"^Jinjō shōgaku shōka$",
|
||||
"^Shōgaku shūjichō$",
|
||||
"^Nihon joshi dokuhon$",
|
||||
"^Joshi shin dokuhon$",
|
||||
"^Chūtō kanbun dokuhon$",
|
||||
"^Wabun dokuhon$",
|
||||
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
|
||||
"(?i)^cardiac rehabilitation$",
|
||||
"(?i)^Analytical summary$",
|
||||
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
|
||||
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
|
||||
"^Prikazi i osvrti$",
|
||||
"^Rodinný dům s provozovnou$",
|
||||
"^Family house with an establishment$",
|
||||
"^Shinsei chūtō shin kokugun$",
|
||||
"^Pulmonary alveolar proteinosis(\\.?)$",
|
||||
"^Shinshū kanbun$",
|
||||
"^Viñeta(s?) de Rodríguez$",
|
||||
"(?i)^RUBRIKA UREDNIKA$",
|
||||
"^A Matching Model of the Academic Publication Market$",
|
||||
"^Yōgaku kōyō$",
|
||||
"^Internetový marketing$",
|
||||
"^Internet marketing$",
|
||||
"^Chūtō kokugo dokuhon$",
|
||||
"^Kokugo dokuhon$",
|
||||
"^Antibiotic Cover for Dental Extraction(s?)$",
|
||||
"^Strategie podniku$",
|
||||
"^Strategy of an Enterprise$",
|
||||
"(?i)^respiratory disease(s?)(\\.?)$",
|
||||
"^Award(s?) for Gallantry in Civil Defence$",
|
||||
"^Podniková kultura$",
|
||||
"^Corporate Culture$",
|
||||
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
|
||||
"^Pracovní motivace$",
|
||||
"^Work Motivation$",
|
||||
"^Kaitei kōtō jogaku dokuhon$",
|
||||
"^Konsolidovaná účetní závěrka$",
|
||||
"^Consolidated Financial Statements$",
|
||||
"(?i)^intracranial tumour(s?)$",
|
||||
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
|
||||
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
|
||||
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
|
||||
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
|
||||
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
|
||||
"^The level of motivation process as a leadership$",
|
||||
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
|
||||
"(?i)^news and events$",
|
||||
"(?i)^NOVOSTI I DOGAĐAJI$",
|
||||
"^Sansū no gakushū$",
|
||||
"^Posouzení informačního systému firmy a návrh změn$",
|
||||
"^Information System Assessment and Proposal for ICT Modification$",
|
||||
"^Stresové zatížení pracovníků ve vybrané profesi$",
|
||||
"^Stress load in a specific job$",
|
||||
"^Sunday: Poster Sessions, Pt.*$",
|
||||
"^Monday: Poster Sessions, Pt.*$",
|
||||
"^Wednesday: Poster Sessions, Pt.*",
|
||||
"^Tuesday: Poster Sessions, Pt.*$",
|
||||
"^Analýza reklamy$",
|
||||
"^Analysis of advertising$",
|
||||
"^Shōgaku shūshinsho$",
|
||||
"^Shōgaku sansū$",
|
||||
"^Shintei joshi kokubun$",
|
||||
"^Taishō joshi kokubun dokuhon$",
|
||||
"^Joshi kokubun$",
|
||||
"^Účetní uzávěrka a účetní závěrka v ČR$",
|
||||
"(?i)^The \"?Causes\"? of Cancer$",
|
||||
"^Normas para la publicación de artículos$",
|
||||
"^Editor('|s)(s|') [Rr]eply$",
|
||||
"^Editor(’|s)(s|’) letter$",
|
||||
"^Redaktoriaus žodis$",
|
||||
"^DISCUSSION ON THE PRECEDING PAPER$",
|
||||
"^Kōtō shōgaku shūshinsho jidōyō$",
|
||||
"^Shōgaku nihon rekishi$",
|
||||
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
|
||||
"^Préface$",
|
||||
"^Occupational [Hh]ealth [Ss]ervices.$",
|
||||
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
|
||||
"^Účetní závěrka ve vybraném podniku.*$",
|
||||
"^Financial statements in selected company$",
|
||||
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
|
||||
"^Pseudomyxoma peritonei$",
|
||||
"^Kazalo autora$",
|
||||
"(?i)^uvodna riječ$",
|
||||
"^Motivace jako způsob vedení lidí$",
|
||||
"^Motivation as a leadership$",
|
||||
"^Polyfunkční dům$",
|
||||
"^Multi\\-funkcional building$",
|
||||
"^Podnikatelský plán$",
|
||||
"(?i)^Podnikatelský záměr$",
|
||||
"(?i)^Business Plan$",
|
||||
"^Oceňování nemovitostí$",
|
||||
"^Marketingová komunikace$",
|
||||
"^Marketing communication$",
|
||||
"^Sumario Analítico$",
|
||||
"^Riječ uredništva$",
|
||||
"^Savjetovanja i priredbe$",
|
||||
"^Índice$",
|
||||
"^(Starobosanski nadpisi).*$",
|
||||
"^Vzdělávání pracovníků v organizaci$",
|
||||
"^Staff training in organization$",
|
||||
"^(Life Histories of North American Geometridae).*$",
|
||||
"^Strategická analýza podniku$",
|
||||
"^Strategic Analysis of an Enterprise$",
|
||||
"^Sadržaj$",
|
||||
"^Upute suradnicima$",
|
||||
"^Rodinný dům$",
|
||||
"(?i)^Fami(l)?ly house$",
|
||||
"^Upute autorima$",
|
||||
"^Strategic Analysis$",
|
||||
"^Finanční analýza vybraného podniku$",
|
||||
"^Finanční analýza$",
|
||||
"^Riječ urednika$",
|
||||
"(?i)^Content(s?)$",
|
||||
"(?i)^Inhalt$",
|
||||
"^Jinjō shōgaku shūshinsho jidōyō$",
|
||||
"(?i)^Index$",
|
||||
"^Chūgaku kokubun kyōkasho$",
|
||||
"^Retrato de una mujer$",
|
||||
"^Retrato de un hombre$",
|
||||
"^Kōtō shōgaku dokuhon$",
|
||||
"^Shotōka kokugo$",
|
||||
"^Shōgaku dokuhon$",
|
||||
"^Jinjō shōgaku kokugo dokuhon$",
|
||||
"^Shinsei kokugo dokuhon$",
|
||||
"^Teikoku dokuhon$",
|
||||
"^Instructions to Authors$",
|
||||
"^KİTAP TAHLİLİ$",
|
||||
"^PRZEGLĄD PIŚMIENNICTWA$",
|
||||
"(?i)^Presentación$",
|
||||
"^İçindekiler$",
|
||||
"(?i)^Tabl?e of contents$",
|
||||
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
|
||||
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
|
||||
"^Editorial( Board)?$",
|
||||
"(?i)^Editorial \\(English\\)$",
|
||||
"^Editörden$",
|
||||
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
|
||||
"^(Kiri Karl Morgensternile).*$",
|
||||
"^(\\[Eksliibris Aleksandr).*\\]$",
|
||||
"^(\\[Eksliibris Aleksandr).*$",
|
||||
"^(Eksliibris Aleksandr).*$",
|
||||
"^(Kiri A\\. de Vignolles).*$",
|
||||
"^(2 kirja Karl Morgensternile).*$",
|
||||
"^(Pirita kloostri idaosa arheoloogilised).*$",
|
||||
"^(Kiri tundmatule).*$",
|
||||
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
|
||||
"^(Eksliibris Nikolai Birukovile).*$",
|
||||
"^(Eksliibris Nikolai Issakovile).*$",
|
||||
"^(WHP Cruise Summary Information of section).*$",
|
||||
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
|
||||
"^(Measurement of the spin\\-dependent structure function).*",
|
||||
"(?i)^.*authors['’′]? reply\\.?$",
|
||||
"(?i)^.*authors['’′]? response\\.?$",
|
||||
"^Data [mM]anagement [sS]ervices\\.$",
|
||||
"Research and Advanced Technology for Digital Libraries"
|
||||
]
|
||||
},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
|
@@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@@ -1,19 +1,13 @@
|
|||
|
||||
package eu.dnetlib.doiboost.crossref;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.net.URI;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
import static eu.dnetlib.dhp.common.collection.DecompressTarGz.doExtract;
|
||||
|
||||
import java.net.URI;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
@@ -33,31 +27,16 @@ public class ExtractCrossrefRecords {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
final String crossrefFileNameTarGz = parser.get("crossrefFileNameTarGz");
|
||||
|
||||
Path hdfsreadpath = new Path(workingPath.concat("/").concat(crossrefFileNameTarGz));
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", workingPath);
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
FileSystem fs = FileSystem.get(URI.create(workingPath), conf);
|
||||
FSDataInputStream crossrefFileStream = fs.open(hdfsreadpath);
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(
|
||||
new GzipCompressorInputStream(crossrefFileStream))) {
|
||||
TarArchiveEntry entry = null;
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
if (!entry.isDirectory()) {
|
||||
try (
|
||||
FSDataOutputStream out = fs
|
||||
.create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
|
||||
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
|
||||
|
||||
IOUtils.copy(tais, gzipOs);
|
||||
doExtract(fs, outputPath, workingPath.concat("/").concat(crossrefFileNameTarGz));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
Log.info("Crossref dump reading completed");
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@@ -59,52 +59,6 @@ object SparkGenerateDoiBoost {
|
|||
val workingDirPath = parser.get("workingPath")
|
||||
val openaireOrganizationPath = parser.get("openaireOrganizationPath")
|
||||
|
||||
val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication] with Serializable {
|
||||
override def zero: Publication = new Publication
|
||||
|
||||
override def reduce(b: Publication, a: (String, Publication)): Publication = {
|
||||
|
||||
if (b == null) {
|
||||
if (a != null && a._2 != null) {
|
||||
a._2.setId(a._1)
|
||||
return a._2
|
||||
}
|
||||
} else {
|
||||
if (a != null && a._2 != null) {
|
||||
b.mergeFrom(a._2)
|
||||
b.setId(a._1)
|
||||
val authors = AuthorMerger.mergeAuthor(b.getAuthor, a._2.getAuthor)
|
||||
b.setAuthor(authors)
|
||||
return b
|
||||
}
|
||||
}
|
||||
new Publication
|
||||
}
|
||||
|
||||
override def merge(b1: Publication, b2: Publication): Publication = {
|
||||
if (b1 == null) {
|
||||
if (b2 != null)
|
||||
return b2
|
||||
} else {
|
||||
if (b2 != null) {
|
||||
b1.mergeFrom(b2)
|
||||
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
|
||||
b1.setAuthor(authors)
|
||||
if (b2.getId != null && b2.getId.nonEmpty)
|
||||
b1.setId(b2.getId)
|
||||
return b1
|
||||
}
|
||||
}
|
||||
new Publication
|
||||
}
|
||||
|
||||
override def finish(reduction: Publication): Publication = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
|
||||
override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
}
|
||||
|
||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
|
||||
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
|
||||
|
@@ -175,8 +129,33 @@ object SparkGenerateDoiBoost {
|
|||
.map(DoiBoostMappingUtil.fixPublication)
|
||||
.map(p => (p.getId, p))
|
||||
.groupByKey(_._1)
|
||||
.agg(crossrefAggregator.toColumn)
|
||||
.map(p => p._2)
|
||||
.reduceGroups((left, right) => {
|
||||
//Check left is not null
|
||||
if (left != null && left._1 != null) {
|
||||
//If right is null then return left
|
||||
if (right == null || right._2 == null)
|
||||
left
|
||||
else {
|
||||
// Here Left and Right are not null
|
||||
// So we have to merge
|
||||
val b1 = left._2
|
||||
val b2 = right._2
|
||||
b1.mergeFrom(b2)
|
||||
b1.mergeOAFDataInfo(b2)
|
||||
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
|
||||
b1.setAuthor(authors)
|
||||
if (b2.getId != null && b2.getId.nonEmpty)
|
||||
b1.setId(b2.getId)
|
||||
//Return publication Merged
|
||||
(b1.getId, b1)
|
||||
}
|
||||
} else {
|
||||
// Left is Null so we return right
|
||||
right
|
||||
}
|
||||
})
|
||||
.filter(s => s != null && s._2 != null)
|
||||
.map(s => s._2._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingDirPath/doiBoostPublicationFiltered")
|
||||
|
|
|
@@ -446,16 +446,12 @@ case object Crossref2Oaf {
|
|||
case "10.13039/501100000781" =>
|
||||
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
|
||||
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
|
||||
case "10.13039/100000001" =>
|
||||
generateSimpleRelationFromAward(funder, "nsf_________", a => a)
|
||||
case "10.13039/501100001665" =>
|
||||
generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
||||
case "10.13039/501100002341" =>
|
||||
generateSimpleRelationFromAward(funder, "aka_________", a => a)
|
||||
case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a)
|
||||
case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
|
||||
case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
|
||||
case "10.13039/501100001602" =>
|
||||
generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
|
||||
case "10.13039/501100000923" =>
|
||||
generateSimpleRelationFromAward(funder, "arc_________", a => a)
|
||||
generateSimpleRelationFromAward(funder, "sfi_________", a => a.replace("SFI", ""))
|
||||
case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
|
||||
case "10.13039/501100000038" =>
|
||||
val targetId = getProjectId("nserc_______", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
|
@@ -468,14 +464,10 @@ case object Crossref2Oaf {
|
|||
val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
case "10.13039/501100002848" =>
|
||||
generateSimpleRelationFromAward(funder, "conicytf____", a => a)
|
||||
case "10.13039/501100003448" =>
|
||||
generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
|
||||
case "10.13039/501100010198" =>
|
||||
generateSimpleRelationFromAward(funder, "sgov________", a => a)
|
||||
case "10.13039/501100004564" =>
|
||||
generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
|
||||
case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
|
||||
case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
|
||||
case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a => a)
|
||||
case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
|
||||
case "10.13039/501100003407" =>
|
||||
generateSimpleRelationFromAward(funder, "miur________", a => a)
|
||||
val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
|
@@ -487,15 +479,11 @@ case object Crossref2Oaf {
|
|||
"irb_hr______",
|
||||
a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", "")
|
||||
)
|
||||
case "10.13039/501100006769" =>
|
||||
generateSimpleRelationFromAward(funder, "rsf_________", a => a)
|
||||
case "10.13039/501100001711" =>
|
||||
generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
|
||||
case "10.13039/501100004410" =>
|
||||
generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
|
||||
case "10.10.13039/100004440" =>
|
||||
generateSimpleRelationFromAward(funder, "wt__________", a => a)
|
||||
case "10.13039/501100006769" => generateSimpleRelationFromAward(funder, "rsf_________", a => a)
|
||||
case "10.13039/501100001711" => generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
|
||||
case "10.13039/501100004410" => generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
|
||||
case "10.13039/100004440" =>
|
||||
generateSimpleRelationFromAward(funder, "wt__________", a => a)
|
||||
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
|
@@ -516,6 +504,7 @@ case object Crossref2Oaf {
|
|||
case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
|
||||
generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
|
||||
case "Wellcome Trust Masters Fellowship" =>
|
||||
generateSimpleRelationFromAward(funder, "wt__________", a => a)
|
||||
val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
|
|
|
@@ -1456,7 +1456,7 @@
|
|||
"issued": {
|
||||
"date-parts": [
|
||||
[
|
||||
2021,
|
||||
3021,
|
||||
2,
|
||||
22
|
||||
]
|
||||
|
|
|
@@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@@ -51,7 +51,7 @@
|
|||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
|
||||
|
|
|
@@ -95,13 +95,14 @@ public class ResultTagger implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
result
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
|
||||
.flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
|
||||
.map(s -> StringUtils.substringAfter(s, "|"))
|
||||
.collect(Collectors.toCollection(HashSet::new))
|
||||
// result
|
||||
// .getInstance()
|
||||
// .stream()
|
||||
// .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
|
||||
// .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
|
||||
// .map(s -> StringUtils.substringAfter(s, "|"))
|
||||
// .collect(Collectors.toCollection(HashSet::new))
|
||||
tmp
|
||||
.forEach(
|
||||
dsId -> datasources
|
||||
.addAll(
|
||||
|
|
|
@@ -22,4 +22,11 @@ public class CountrySbs implements Serializable {
|
|||
public void setClassname(String classname) {
|
||||
this.classname = classname;
|
||||
}
|
||||
|
||||
public static CountrySbs newInstance(String classid, String classname) {
|
||||
CountrySbs csbs = new CountrySbs();
|
||||
csbs.classid = classid;
|
||||
csbs.classname = classname;
|
||||
return csbs;
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -22,4 +22,11 @@ public class DatasourceCountry implements Serializable {
|
|||
public void setCountry(CountrySbs country) {
|
||||
this.country = country;
|
||||
}
|
||||
|
||||
public static DatasourceCountry newInstance(String dataSourceId, CountrySbs country) {
|
||||
DatasourceCountry dsc = new DatasourceCountry();
|
||||
dsc.dataSourceId = dataSourceId;
|
||||
dsc.country = country;
|
||||
return dsc;
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,32 @@
|
|||
|
||||
package eu.dnetlib.dhp.countrypropagation;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class EntityEntityRel implements Serializable {
|
||||
private String entity1Id;
|
||||
private String entity2Id;
|
||||
|
||||
public static EntityEntityRel newInstance(String source, String target) {
|
||||
EntityEntityRel dso = new EntityEntityRel();
|
||||
dso.entity1Id = source;
|
||||
dso.entity2Id = target;
|
||||
return dso;
|
||||
}
|
||||
|
||||
public String getEntity1Id() {
|
||||
return entity1Id;
|
||||
}
|
||||
|
||||
public void setEntity1Id(String entity1Id) {
|
||||
this.entity1Id = entity1Id;
|
||||
}
|
||||
|
||||
public String getEntity2Id() {
|
||||
return entity2Id;
|
||||
}
|
||||
|
||||
public void setEntity2Id(String entity2Id) {
|
||||
this.entity2Id = entity2Id;
|
||||
}
|
||||
}
|
|
@@ -2,14 +2,16 @@
|
|||
package eu.dnetlib.dhp.countrypropagation;
|
||||
|
||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.ForeachFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
|
@@ -17,11 +19,15 @@ import org.apache.spark.sql.SparkSession;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* For the association of the country to the datasource The association is computed only for datasource of specific type
|
||||
|
@@ -54,9 +60,8 @@ public class PrepareDatasourceCountryAssociation {
|
|||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
|
||||
|
||||
runWithSparkHiveSession(
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
|
@@ -77,40 +82,46 @@ public class PrepareDatasourceCountryAssociation {
|
|||
String inputPath,
|
||||
String outputPath) {
|
||||
|
||||
final String whitelisted = whitelist
|
||||
.stream()
|
||||
.map(id -> " d.id = '" + id + "'")
|
||||
.collect(Collectors.joining(" OR "));
|
||||
// filtering of the datasource taking only the non deleted by inference and those with the allowed types or
|
||||
// whose id is in whitelist
|
||||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class)
|
||||
.filter(
|
||||
(FilterFunction<Datasource>) ds -> !ds.getDataInfo().getDeletedbyinference() &&
|
||||
(allowedtypes.contains(ds.getDatasourcetype().getClassid()) ||
|
||||
whitelist.contains(ds.getId())));
|
||||
|
||||
final String allowed = allowedtypes
|
||||
.stream()
|
||||
.map(type -> " d.datasourcetype.classid = '" + type + "'")
|
||||
.collect(Collectors.joining(" OR "));
|
||||
// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
|
||||
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class)
|
||||
.filter(
|
||||
(FilterFunction<Relation>) rel -> rel.getRelClass().equalsIgnoreCase(ModelConstants.IS_PROVIDED_BY) &&
|
||||
!rel.getDataInfo().getDeletedbyinference());
|
||||
|
||||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
||||
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
|
||||
Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class);
|
||||
// filtering of the organization taking only the non deleted by inference and those with information about the
|
||||
// country
|
||||
Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class)
|
||||
.filter(
|
||||
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference() &&
|
||||
o.getCountry().getClassid().length() > 0 &&
|
||||
!o.getCountry().getClassid().equals(ModelConstants.UNKNOWN));
|
||||
|
||||
datasource.createOrReplaceTempView("datasource");
|
||||
relation.createOrReplaceTempView("relation");
|
||||
organization.createOrReplaceTempView("organization");
|
||||
// associated the datasource id with the id of the organization providing the datasource
|
||||
Dataset<EntityEntityRel> dse = datasource
|
||||
.joinWith(relation, datasource.col("id").equalTo(relation.col("source")))
|
||||
.map(
|
||||
(MapFunction<Tuple2<Datasource, Relation>, EntityEntityRel>) t2 -> EntityEntityRel
|
||||
.newInstance(t2._2.getSource(), t2._2.getTarget()),
|
||||
Encoders.bean(EntityEntityRel.class));
|
||||
|
||||
String query = "SELECT source dataSourceId, " +
|
||||
"named_struct('classid', country.classid, 'classname', country.classname) country " +
|
||||
"FROM datasource d " +
|
||||
"JOIN relation rel " +
|
||||
"ON d.id = rel.source " +
|
||||
"JOIN organization o " +
|
||||
"ON o.id = rel.target " +
|
||||
"WHERE rel.datainfo.deletedbyinference = false " +
|
||||
"and lower(rel.relclass) = '" + ModelConstants.IS_PROVIDED_BY.toLowerCase() + "'" +
|
||||
"and o.datainfo.deletedbyinference = false " +
|
||||
"and length(o.country.classid) > 0 " +
|
||||
"and (" + allowed + " or " + whitelisted + ")";
|
||||
|
||||
spark
|
||||
.sql(query)
|
||||
.as(Encoders.bean(DatasourceCountry.class))
|
||||
// joins with the information stored in the organization dataset to associate the country to the datasource id
|
||||
dse
|
||||
.joinWith(organization, dse.col("entity2Id").equalTo(organization.col("id")))
|
||||
.map((MapFunction<Tuple2<EntityEntityRel, Organization>, DatasourceCountry>) t2 -> {
|
||||
Qualifier country = t2._2.getCountry();
|
||||
return DatasourceCountry
|
||||
.newInstance(
|
||||
t2._1.getEntity1Id(),
|
||||
CountrySbs.newInstance(country.getClassid(), country.getClassname()));
|
||||
}, Encoders.bean(DatasourceCountry.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
|
|
|
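In short, the rewritten preparation step replaces the Hive SQL query with two Dataset joins: the datasources joined with the IsProvidedBy relations give (datasourceId, organizationId) pairs, and a second join with the filtered organizations attaches the country. A condensed sketch of that shape, assuming the filtered datasource, relation and organization Datasets from the hunk above; the final target path is an assumption, since the hunk is cut off at the write:

Dataset<EntityEntityRel> dse = datasource
	.joinWith(relation, datasource.col("id").equalTo(relation.col("source")))
	.map(
		(MapFunction<Tuple2<Datasource, Relation>, EntityEntityRel>) t2 -> EntityEntityRel
			.newInstance(t2._2.getSource(), t2._2.getTarget()),
		Encoders.bean(EntityEntityRel.class));

dse
	.joinWith(organization, dse.col("entity2Id").equalTo(organization.col("id")))
	.map(
		(MapFunction<Tuple2<EntityEntityRel, Organization>, DatasourceCountry>) t2 -> DatasourceCountry
			.newInstance(
				t2._1.getEntity1Id(),
				CountrySbs.newInstance(t2._2.getCountry().getClassid(), t2._2.getCountry().getClassname())),
		Encoders.bean(DatasourceCountry.class))
	.write()
	.option("compression", "gzip")
	.mode(SaveMode.Overwrite)
	.json(outputPath); // assumed: the job's outputPath argument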
@@ -3,14 +3,21 @@ package eu.dnetlib.dhp.countrypropagation;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
@@ -23,14 +30,6 @@ import scala.Tuple2;
public class PrepareResultCountrySet {
	private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);

	private static final String RESULT_COUNTRYSET_QUERY = "SELECT id resultId, collect_set(country) countrySet "
		+ "FROM ( SELECT id, country "
		+ "FROM datasource_country JOIN cfhb ON cf = dataSourceId "
		+ "UNION ALL "
		+ "SELECT id, country FROM datasource_country "
		+ "JOIN cfhb ON hb = dataSourceId ) tmp "
		+ "GROUP BY id";

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
@@ -45,6 +44,8 @@ public class PrepareResultCountrySet {
		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String workingPath = parser.get("workingPath");

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);
@@ -60,9 +61,8 @@ public class PrepareResultCountrySet {
		Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

		runWithSparkHiveSession(
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
@@ -72,6 +72,7 @@ public class PrepareResultCountrySet {
					inputPath,
					outputPath,
					datasourcecountrypath,
					workingPath,
					resultClazz);
			});
	}
@@ -81,43 +82,63 @@ public class PrepareResultCountrySet {
		String inputPath,
		String outputPath,
		String datasourcecountrypath,
		String workingPath,
		Class<R> resultClazz) {

		Dataset<R> result = readPath(spark, inputPath, resultClazz);
		result.createOrReplaceTempView("result");
		// select all the results not deleted by inference and not invisible
		Dataset<R> result = readPath(spark, inputPath, resultClazz)
			.filter(
				(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
					!r.getDataInfo().getInvisible());

		createCfHbforResult(spark);
		// for each result, collect the distinct collectedfrom (at result level) and hostedby keys,
		// and produce a (resultId, key) pair for each distinct key associated with the result
		result.flatMap((FlatMapFunction<R, EntityEntityRel>) r -> {
			Set<String> cfhb = r.getCollectedfrom().stream().map(cf -> cf.getKey()).collect(Collectors.toSet());
			cfhb.addAll(r.getInstance().stream().map(i -> i.getHostedby().getKey()).collect(Collectors.toSet()));
			return cfhb
				.stream()
				.map(value -> EntityEntityRel.newInstance(r.getId(), value))
				.collect(Collectors.toList())
				.iterator();
		}, Encoders.bean(EntityEntityRel.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingPath + "/resultCfHb");

		Dataset<DatasourceCountry> datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);

		datasource_country.createOrReplaceTempView("datasource_country");

		spark
			.sql(RESULT_COUNTRYSET_QUERY)
			.as(Encoders.bean(ResultCountrySet.class))
			.toJavaRDD()
			.mapToPair(value -> new Tuple2<>(value.getResultId(), value))
			.reduceByKey((a, b) -> {
				ArrayList<CountrySbs> countryList = a.getCountrySet();
				Set<String> countryCodes = countryList
					.stream()
					.map(CountrySbs::getClassid)
					.collect(Collectors.toSet());
				b
					.getCountrySet()
					.stream()
					.forEach(c -> {
						if (!countryCodes.contains(c.getClassid())) {
							countryList.add(c);
							countryCodes.add(c.getClassid());
						}
		Dataset<EntityEntityRel> cfhb = readPath(spark, workingPath + "/resultCfHb", EntityEntityRel.class);

		datasource_country
			.joinWith(
				cfhb, cfhb
					.col("entity2Id")
					.equalTo(datasource_country.col("datasourceId")))
			.groupByKey(
				(MapFunction<Tuple2<DatasourceCountry, EntityEntityRel>, String>) t2 -> t2._2().getEntity1Id(),
				Encoders.STRING())
			.mapGroups(
				(MapGroupsFunction<String, Tuple2<DatasourceCountry, EntityEntityRel>, ResultCountrySet>) (k, it) -> {
					ResultCountrySet rcs = new ResultCountrySet();
					rcs.setResultId(k);
					Set<CountrySbs> set = new HashSet<>();
					Set<String> countryCodes = new HashSet<>();
					DatasourceCountry first = it.next()._1();
					countryCodes.add(first.getCountry().getClassid());
					set.add(first.getCountry());
					it.forEachRemaining(t2 -> {
						if (!countryCodes.contains(t2._1().getCountry().getClassid()))
							set.add(t2._1().getCountry());
					});
					a.setCountrySet(countryList);
					return a;
				})
			.map(couple -> OBJECT_MAPPER.writeValueAsString(couple._2()))
			.saveAsTextFile(outputPath, GzipCodec.class);
					rcs.setCountrySet(new ArrayList<>(set));
					return rcs;
				}, Encoders.bean(ResultCountrySet.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

}
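The essential change in PrepareResultCountrySet is that the per-result country aggregation now runs through groupByKey/mapGroups, deduplicating countries by classid, instead of an RDD reduceByKey over the Hive query output. Extracted as a stand-alone helper, the deduplication amounts to the following; this method is hypothetical and does not exist in the diff:

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

// Hypothetical helper: keep one CountrySbs per classid, preserving the first occurrence,
// mirroring what the mapGroups function above does for each resultId group.
static List<CountrySbs> dedupByClassid(Iterator<DatasourceCountry> it) {
	Set<String> seen = new HashSet<>();
	List<CountrySbs> merged = new ArrayList<>();
	it.forEachRemaining(dc -> {
		CountrySbs c = dc.getCountry();
		if (seen.add(c.getClassid())) {
			merged.add(c);
		}
	});
	return merged;
}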
@@ -56,12 +56,6 @@ public class SparkCountryPropagationJob {
		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		final Boolean saveGraph = Optional
			.ofNullable(parser.get("saveGraph"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("saveGraph: {}", saveGraph);

		Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

		SparkConf conf = new SparkConf();
@@ -75,8 +69,7 @@ public class SparkCountryPropagationJob {
				sourcePath,
				preparedInfoPath,
				outputPath,
				resultClazz,
				saveGraph);
				resultClazz);
		});
	}

@@ -85,10 +78,8 @@ public class SparkCountryPropagationJob {
			String sourcePath,
			String preparedInfoPath,
			String outputPath,
			Class<R> resultClazz,
			boolean saveGraph) {
			Class<R> resultClazz) {

		if (saveGraph) {
			log.info("Reading Graph table from: {}", sourcePath);
			Dataset<R> res = readPath(spark, sourcePath, resultClazz);

@@ -105,7 +96,7 @@ public class SparkCountryPropagationJob {
				.option("compression", "gzip")
				.mode(SaveMode.Overwrite)
				.json(outputPath);
		}

	}

	private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
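With the saveGraph flag removed, SparkCountryPropagationJob always writes the enriched results. Putting the visible hunks together, the simplified method is roughly the sketch below; the method name, the join in the middle and getCountryMergeFn's behaviour are assumptions based on the surrounding code, not lines taken from the diff:

private static <R extends Result> void execPropagation(
	SparkSession spark,
	String sourcePath,
	String preparedInfoPath,
	String outputPath,
	Class<R> resultClazz) {

	log.info("Reading Graph table from: {}", sourcePath);
	Dataset<R> res = readPath(spark, sourcePath, resultClazz);

	// assumed: join each result with its prepared country set and merge the countries in
	Dataset<ResultCountrySet> prepared = readPath(spark, preparedInfoPath, ResultCountrySet.class);
	res
		.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
		.map(getCountryMergeFn(), Encoders.bean(resultClazz))
		.write()
		.option("compression", "gzip")
		.mode(SaveMode.Overwrite)
		.json(outputPath);
}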
@@ -5,18 +5,6 @@
    "paramDescription": "the path of the sequential file to read",
    "paramRequired": true
  },
  {
    "paramName": "h",
    "paramLongName": "hive_metastore_uris",
    "paramDescription": "the hive metastore uris",
    "paramRequired": false
  },
  {
    "paramName": "sg",
    "paramLongName": "saveGraph",
    "paramDescription": "true if the new version of the graph must be saved",
    "paramRequired": false
  },
  {
    "paramName": "tn",
    "paramLongName": "resultTableName",
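These parameter definitions are the JSON consumed by ArgumentApplicationParser in each job's main method; the hive_metastore_uris and saveGraph entries are dropped because the job above no longer reads them. A rough sketch of how such a definition is consumed, with an illustrative resource path:

import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

String jsonConfiguration = IOUtils
	.toString(
		SparkCountryPropagationJob.class
			.getResourceAsStream(
				"/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json")); // path is illustrative

final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

final String resultClassName = parser.get("resultTableName"); // the "tn" / "resultTableName" entry above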
@@ -5,12 +5,6 @@
    "paramDescription": "the path of the sequential file to read",
    "paramRequired": true
  },
  {
    "paramName": "h",
    "paramLongName": "hive_metastore_uris",
    "paramDescription": "the hive metastore uris",
    "paramRequired": true
  },
  {
    "paramName": "out",
    "paramLongName": "outputPath",
@@ -12,9 +12,9 @@
    "paramRequired": true
  },
  {
    "paramName": "h",
    "paramLongName": "hive_metastore_uris",
    "paramDescription": "the hive metastore uris",
    "paramName": "w",
    "paramLongName": "workingPath",
    "paramDescription": "the working path",
    "paramRequired": true
  },
  {
@@ -110,7 +110,6 @@
                <arg>--sourcePath</arg><arg>${sourcePath}</arg>
                <arg>--whitelist</arg><arg>${whitelist}</arg>
                <arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
                <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
                <arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
            </spark>
            <ok to="fork_join_prepare_result_country"/>

@@ -146,7 +145,7 @@
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
                <arg>--outputPath</arg><arg>${workingDir}/publication</arg>
                <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
                <arg>--workingPath</arg><arg>${workingDir}/workingP</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
            </spark>

@@ -176,7 +175,7 @@
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
                <arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
                <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
                <arg>--workingPath</arg><arg>${workingDir}/workingD</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
            </spark>

@@ -206,7 +205,7 @@
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
                <arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
                <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
                <arg>--workingPath</arg><arg>${workingDir}/workingO</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
            </spark>

@@ -236,7 +235,7 @@
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
                <arg>--outputPath</arg><arg>${workingDir}/software</arg>
                <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
                <arg>--workingPath</arg><arg>${workingDir}/workingS</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
            </spark>

@@ -275,7 +274,6 @@
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
                <arg>--saveGraph</arg><arg>${saveGraph}</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
                <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
            </spark>

@@ -305,7 +303,6 @@
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
                <arg>--saveGraph</arg><arg>${saveGraph}</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
                <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
            </spark>

@@ -335,7 +332,6 @@
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
                <arg>--saveGraph</arg><arg>${saveGraph}</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
                <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
            </spark>

@@ -365,7 +361,6 @@
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
                <arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
                <arg>--saveGraph</arg><arg>${saveGraph}</arg>
                <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
                <arg>--outputPath</arg><arg>${outputPath}/software</arg>
            </spark>
Some files were not shown because too many files have changed in this diff.