1
0
Fork 0

merge branch with master

This commit is contained in:
Miriam Baglioni 2021-08-05 11:34:20 +02:00
commit ee13da9258
18 changed files with 523 additions and 94 deletions

View File

@ -1,2 +1,2 @@
# dnet-hadoop # dnet-hadoop
Dnet-hadoop is a tool for Dnet-hadoop is the project that defines all the OOZIE workflows for the OpenAIRE Graph construction, processing, and provisioning.

View File

@ -25,6 +25,11 @@
<groupId>com.github.sisyphsu</groupId> <groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
</dependency> </dependency>
<dependency>
<groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_2.11</artifactId>

View File

@ -7,22 +7,19 @@ import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException; import java.time.format.DateTimeParseException;
import java.util.*; import java.util.*;
import java.util.function.Function; import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import com.github.sisyphsu.dateparser.DateParserUtils; import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import me.xuender.unidecode.Unidecode;
public class GraphCleaningFunctions extends CleaningFunctions { public class GraphCleaningFunctions extends CleaningFunctions {
@ -194,11 +191,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter( .filter(
sp -> sp sp -> {
final String title = sp
.getValue() .getValue()
.toLowerCase() .toLowerCase();
.replaceAll(TITLE_FILTER_REGEX, "") final String residual = Unidecode
.length() > TITLE_FILTER_RESIDUAL_LENGTH) .decode(title)
.replaceAll(TITLE_FILTER_REGEX, "");
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
})
.map(GraphCleaningFunctions::cleanValue) .map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }

View File

@ -4,12 +4,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException; import java.io.IOException;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -19,13 +15,32 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import me.xuender.unidecode.Unidecode;
public class OafMapperUtilsTest { public class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
/**
 * Checks the output of {@link Unidecode#decode(String)} on a sample of texts written in different
 * scripts, each paired with the transliteration expected for it.
 */
@Test
public void testUnidecode() {
    // each row: { expected transliteration, text to decode }
    final String[][] samples = {
        { "Liu Ben Mu hiruzuSen tawa", "六本木ヒルズ森タワ" },
        { "Nan Wu A Mi Tuo Fo", "南无阿弥陀佛" },
        { "Yi Tiao Hui Zou Lu De Yu", "一条会走路的鱼" },
        { "amidaniyorai", "あみだにょらい" },
        { "T`owrk`iayi", "Թուրքիայի" },
        { "Obzor tematiki", "Обзор тематики" },
        { "GERMANSKIE IaZYKI", "ГЕРМАНСКИЕ ЯЗЫКИ" },
        { "Diereunese tes ikanopoieses", "Διερεύνηση της ικανοποίησης" },
        { "lqDy l'wly@", "القضايا الأولية" },
        { "abc def ghi", "abc def ghi" }
    };

    for (final String[] sample : samples) {
        final String expected = sample[0];
        final String decoded = Unidecode.decode(sample[1]);
        assertEquals(expected, decoded);
    }
}
@Test @Test
public void testDateValidation() { public void testDateValidation() {

View File

@ -0,0 +1,127 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.raw.common.RelationIdMapping;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/**
 * Spark application that patches the identifiers (source/target) of the relations stored under
 * {@code <graphBasePath>/relation}, replacing them according to an old -> new identifier mapping.
 */
public class PatchRelationsApplication {

    private static final Logger log = LoggerFactory.getLogger(PatchRelationsApplication.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Classpath resource describing the command line parameters accepted by this application. */
    private static final String PARAMETERS_RESOURCE = "/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json";

    /**
     * Parses the command line arguments and runs the relation patching within a Spark session.
     *
     * @param args the command line arguments, as described by the parameters resource
     * @throws Exception when the parameters definition cannot be loaded, the arguments cannot be parsed or
     *                   the Spark job fails
     */
    public static void main(final String[] args) throws Exception {

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(readParametersDefinition());
        parser.parseArgument(args);

        // optional parameter: falls back to TRUE when it is not provided
        final Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String graphBasePath = parser.get("graphBasePath");
        log.info("graphBasePath: {}", graphBasePath);

        final String workingDir = parser.get("workingDir");
        log.info("workingDir: {}", workingDir);

        final String idMappingPath = parser.get("idMappingPath");
        log.info("idMappingPath: {}", idMappingPath);

        final SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> patchRelations(spark, graphBasePath, workingDir, idMappingPath));
    }

    /**
     * Loads the JSON definition of the command line parameters from the classpath.
     *
     * The stream is closed once read and the content is always decoded as UTF-8, so that the outcome
     * does not depend on the default charset of the platform.
     *
     * @return the content of the parameters resource
     * @throws FileNotFoundException when the resource is not available on the classpath
     * @throws IOException when the resource cannot be read
     */
    private static String readParametersDefinition() throws IOException {
        try (InputStream in = PatchRelationsApplication.class.getResourceAsStream(PARAMETERS_RESOURCE)) {
            if (in == null) {
                throw new FileNotFoundException(PARAMETERS_RESOURCE);
            }
            return IOUtils.toString(in, StandardCharsets.UTF_8);
        }
    }

    /**
     * Substitutes the identifiers (source/target) from the set of relations part of the graphBasePath included in the
     * mapping provided by the dataset stored on idMappingPath, using workingDir as intermediate storage location.
     *
     * @param spark the SparkSession
     * @param graphBasePath base graph path providing the set of relations to patch
     * @param workingDir intermediate storage location
     * @param idMappingPath dataset providing the old -> new identifier mapping
     */
    private static void patchRelations(final SparkSession spark, final String graphBasePath, final String workingDir,
        final String idMappingPath) {

        final String relationPath = graphBasePath + "/relation";

        final Dataset<Relation> rels = Utils.readPath(spark, relationPath, Relation.class);
        final Dataset<RelationIdMapping> idMapping = Utils.readPath(spark, idMappingPath, RelationIdMapping.class);

        // NOTE(review): each count() is evaluated only to feed the log line
        log.info("relations: {}", rels.count());
        log.info("idMapping: {}", idMapping.count());

        // left join: relations whose source has no mapping are kept untouched
        final Dataset<Relation> bySource = rels
            .joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left")
            .map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
                final Relation r = t._1();
                Optional
                    .ofNullable(t._2())
                    .map(RelationIdMapping::getNewId)
                    .ifPresent(r::setSource);
                return r;
            }, Encoders.bean(Relation.class));

        // same approach for the target, applied on top of the relations already patched by source
        bySource
            .joinWith(idMapping, bySource.col("target").equalTo(idMapping.col("oldId")), "left")
            .map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
                final Relation r = t._1();
                Optional
                    .ofNullable(t._2())
                    .map(RelationIdMapping::getNewId)
                    .ifPresent(r::setTarget);
                return r;
            }, Encoders.bean(Relation.class))
            .map(
                (MapFunction<Relation, String>) OBJECT_MAPPER::writeValueAsString,
                Encoders.STRING())
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .text(workingDir);

        // the patched relations stored in workingDir replace the content of relationPath;
        // presumably the intermediate step exists because relationPath is also the input of this job
        spark
            .read()
            .textFile(workingDir)
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .text(relationPath);
    }
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
/**
 * Bean pairing an identifier ({@code oldId}) with the identifier that replaces it ({@code newId}).
 */
public class RelationIdMapping {

    /** The identifier to be replaced. */
    private String oldId;

    /** The identifier replacing {@link #oldId}. */
    private String newId;

    /** @return the identifier to be replaced */
    public String getOldId() {
        return this.oldId;
    }

    /** @return the replacement identifier */
    public String getNewId() {
        return this.newId;
    }

    /** @param oldId the identifier to be replaced */
    public void setOldId(final String oldId) {
        this.oldId = oldId;
    }

    /** @param newId the replacement identifier */
    public void setNewId(final String newId) {
        this.newId = newId;
    }
}

View File

@ -0,0 +1,26 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "g",
"paramLongName": "graphBasePath",
"paramDescription": "base graph path providing the set of relations to patch",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDir",
"paramDescription": "intermediate storage location",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "idMappingPath",
"paramDescription": "dataset providing the old -> new identifier mapping",
"paramRequired": true
}
]

View File

@ -100,6 +100,16 @@
<value></value> <value></value>
<description>a blacklist of nsprefixes (comma separated)</description> <description>a blacklist of nsprefixes (comma separated)</description>
</property> </property>
<property>
<name>shouldPatchRelations</name>
<value>false</value>
<description>activates the relation patching phase, driven by the content in ${idMappingPath}</description>
</property>
<property>
<name>idMappingPath</name>
<value></value>
<description>path pointing to the relations identifiers mapping dataset</description>
</property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
<description>memory for driver process</description> <description>memory for driver process</description>
@ -551,7 +561,6 @@
<path start="merge_claims_relation"/> <path start="merge_claims_relation"/>
</fork> </fork>
<action name="merge_claims_publication"> <action name="merge_claims_publication">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -760,7 +769,42 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait_merge" to="End"/> <join name="wait_merge" to="decisionPatchRelations"/>
<decision name="decisionPatchRelations">
<switch>
<case to="patchRelations">
${(shouldPatchRelations eq "true") and
(fs:exists(concat(concat(wf:conf('nameNode'),'/'),wf:conf('idMappingPath'))) eq "true")}
</case>
<default to="End"/>
</switch>
</decision>
<action name="patchRelations">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PatchRelations</name>
<class>eu.dnetlib.dhp.oa.graph.raw.PatchRelationsApplication</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--graphBasePath</arg><arg>${graphOutputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}/patch_relations</arg>
<arg>--idMappingPath</arg><arg>${idMappingPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -1,11 +1,9 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static org.junit.jupiter.api.Assertions.assertEquals; import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.cleanup;
import static org.junit.jupiter.api.Assertions.assertFalse; import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.fixVocabularyNames;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.lenient;
import java.io.IOException; import java.io.IOException;
@ -25,15 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.oa.graph.clean.GraphCleaningFunctionsTest; import eu.dnetlib.dhp.oa.graph.clean.GraphCleaningFunctionsTest;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -74,7 +64,7 @@ public class MappersTest {
assertValidId(p.getId()); assertValidId(p.getId());
assertEquals(1, p.getOriginalId().size()); assertEquals(2, p.getOriginalId().size());
assertTrue(p.getOriginalId().contains("10.3897/oneeco.2.e13718")); assertTrue(p.getOriginalId().contains("10.3897/oneeco.2.e13718"));
assertValidId(p.getCollectedfrom().get(0).getKey()); assertValidId(p.getCollectedfrom().get(0).getKey());
@ -261,8 +251,8 @@ public class MappersTest {
final Relation r2 = (Relation) list.get(2); final Relation r2 = (Relation) list.get(2);
assertValidId(d.getId()); assertValidId(d.getId());
assertEquals(1, d.getOriginalId().size()); assertEquals(2, d.getOriginalId().size());
assertTrue(d.getOriginalId().contains("oai:zenodo.org:3234526")); assertTrue(d.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:zenodo.org:3234526")));
assertValidId(d.getCollectedfrom().get(0).getKey()); assertValidId(d.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
assertTrue(d.getAuthor().size() > 0); assertTrue(d.getAuthor().size() > 0);
@ -351,8 +341,11 @@ public class MappersTest {
final Publication p = (Publication) list.get(0); final Publication p = (Publication) list.get(0);
assertValidId(p.getId()); assertValidId(p.getId());
assertTrue(p.getOriginalId().size() == 1); assertEquals(2, p.getOriginalId().size());
assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0));
assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:pub.uni-bielefeld.de:2949739")));
// assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0));
assertValidId(p.getCollectedfrom().get(0).getKey()); assertValidId(p.getCollectedfrom().get(0).getKey());
assertTrue(p.getAuthor().size() > 0); assertTrue(p.getAuthor().size() > 0);
@ -413,7 +406,8 @@ public class MappersTest {
assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, d.getDataInfo().getProvenanceaction().getSchemename()); assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, d.getDataInfo().getProvenanceaction().getSchemename());
assertValidId(d.getId()); assertValidId(d.getId());
assertTrue(d.getOriginalId().size() == 1); assertEquals(2, d.getOriginalId().size());
assertEquals("feabb67c-1fd1-423b-aec6-606d04ce53c6", d.getOriginalId().get(0)); assertEquals("feabb67c-1fd1-423b-aec6-606d04ce53c6", d.getOriginalId().get(0));
assertValidId(d.getCollectedfrom().get(0).getKey()); assertValidId(d.getCollectedfrom().get(0).getKey());
@ -567,31 +561,6 @@ public class MappersTest {
assertNotNull(d.getInstance().get(0).getUrl()); assertNotNull(d.getInstance().get(0).getUrl());
} }
/**
 * Maps the {@code enermaps.xml} record with the {@link OdfToOafMapper} and verifies that it produces a
 * single {@link Dataset} carrying valid identifiers, a title, one author, one instance with a URL and
 * the expected context identifier.
 */
@Test
void testEnermaps() throws IOException {
    final String record = IOUtils.toString(getClass().getResourceAsStream("enermaps.xml"));

    final List<Oaf> mapped = new OdfToOafMapper(vocs, false, true).processMdRecord(record);

    // dump the mapping outcome, delimited by a separator line
    final String separator = "***************";
    System.out.println(separator);
    System.out.println(new ObjectMapper().writeValueAsString(mapped));
    System.out.println(separator);

    assertEquals(1, mapped.size());
    assertTrue(mapped.get(0) instanceof Dataset);

    final Dataset dataset = (Dataset) mapped.get(0);

    assertValidId(dataset.getId());
    assertValidId(dataset.getCollectedfrom().get(0).getKey());
    assertTrue(StringUtils.isNotBlank(dataset.getTitle().get(0).getValue()));
    assertEquals(1, dataset.getAuthor().size());
    assertEquals(1, dataset.getInstance().size());
    assertNotNull(dataset.getInstance().get(0).getUrl());

    // the context list must be checked before its first element is accessed
    assertNotNull(dataset.getContext());
    assertTrue(StringUtils.isNotBlank(dataset.getContext().get(0).getId()));
    assertEquals("enermaps::selection::tgs00004", dataset.getContext().get(0).getId());
}
@Test @Test
void testClaimFromCrossref() throws IOException { void testClaimFromCrossref() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml"));
@ -673,6 +642,30 @@ public class MappersTest {
System.out.println(p.getTitle().get(0).getValue()); System.out.println(p.getTitle().get(0).getValue());
} }
/**
 * Maps the {@code oaf_jairo.xml} record with the {@link OafToOafMapper} and verifies that the single
 * title of the resulting {@link Publication} is still available after the vocabulary fixing and the
 * cleanup are applied.
 */
@Test
void testJairo() throws IOException {
    final String record = IOUtils.toString(getClass().getResourceAsStream("oaf_jairo.xml"));

    final List<Oaf> mapped = new OafToOafMapper(vocs, false, true).processMdRecord(record);

    // dump the mapping outcome, delimited by a separator line
    final String separator = "***************";
    System.out.println(separator);
    System.out.println(new ObjectMapper().writeValueAsString(mapped));
    System.out.println(separator);

    final Publication publication = (Publication) mapped.get(0);

    assertValidId(publication.getId());
    assertValidId(publication.getCollectedfrom().get(0).getKey());

    // checks on the title as produced by the mapper
    assertNotNull(publication.getTitle());
    assertFalse(publication.getTitle().isEmpty());
    assertTrue(publication.getTitle().size() == 1);
    assertTrue(StringUtils.isNotBlank(publication.getTitle().get(0).getValue()));

    // the title must not be dropped by the cleaning functions
    final Publication cleaned = cleanup(fixVocabularyNames(publication));

    assertNotNull(cleaned.getTitle());
    assertFalse(cleaned.getTitle().isEmpty());
}
@Test @Test
void testOdfFromHdfs() throws IOException { void testOdfFromHdfs() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml"));
@ -688,8 +681,8 @@ public class MappersTest {
final Dataset p = (Dataset) list.get(0); final Dataset p = (Dataset) list.get(0);
assertValidId(p.getId()); assertValidId(p.getId());
assertTrue(p.getOriginalId().size() == 1); assertEquals(2, p.getOriginalId().size());
assertEquals("df76e73f-0483-49a4-a9bb-63f2f985574a", p.getOriginalId().get(0)); assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("df76e73f-0483-49a4-a9bb-63f2f985574a")));
assertValidId(p.getCollectedfrom().get(0).getKey()); assertValidId(p.getCollectedfrom().get(0).getKey());
assertTrue(p.getAuthor().size() > 0); assertTrue(p.getAuthor().size() > 0);

View File

@ -0,0 +1,115 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Runs {@link PatchRelationsApplication} on a local Spark session against the fixtures
 * {@code id_mapping.json} and {@code relations_to_patch.json}, then verifies which identifiers are found
 * in the relations written back under the graph base path.
 */
public class PatchRelationApplicationTest {

    private static final Logger log = LoggerFactory.getLogger(PatchRelationApplicationTest.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static final String ID_MAPPING_PATH = "map/id_mapping.json";

    private static SparkSession spark;

    private static Path workingDir;

    /** Creates the temporary working directory, starts the local Spark session and copies the fixtures. */
    @BeforeAll
    public static void beforeAll() throws IOException {
        final String testName = PatchRelationApplicationTest.class.getSimpleName();

        workingDir = Files.createTempDirectory(testName);
        log.info("using work dir {}", workingDir);

        final SparkConf conf = new SparkConf()
            .setAppName(testName)
            .setMaster("local[*]")
            .set("spark.driver.host", "localhost")
            .set("hive.metastore.local", "true")
            .set("spark.ui.enabled", "false")
            .set("spark.sql.warehouse.dir", workingDir.toString())
            .set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

        spark = SparkSession
            .builder()
            .appName(testName)
            .config(conf)
            .getOrCreate();

        copyFixture("id_mapping.json", ID_MAPPING_PATH);
        copyFixture("relations_to_patch.json", "graphBasePath/relation/rels.json");
    }

    /** Copies a classpath resource of this test to the given location, relative to the working directory. */
    private static void copyFixture(final String resource, final String relativeTarget) throws IOException {
        FileUtils
            .copyInputStreamToFile(
                PatchRelationApplicationTest.class.getResourceAsStream(resource),
                workingDir.resolve(relativeTarget).toFile());
    }

    /** Removes the working directory and stops the Spark session. */
    @AfterAll
    public static void afterAll() throws IOException {
        FileUtils.deleteDirectory(workingDir.toFile());
        spark.stop();
    }

    @Test
    public void testPatchRelationApplication() throws Exception {
        final String basePath = workingDir.toString();
        final String graphBasePath = basePath + "/graphBasePath";

        final String[] args = {
            "-isSparkSessionManaged", Boolean.FALSE.toString(),
            "-graphBasePath", graphBasePath,
            "-workingDir", basePath + "/workingDir",
            "-idMappingPath", basePath + "/" + ID_MAPPING_PATH
        };
        PatchRelationsApplication.main(args);

        // read back the relations written by the application
        final List<Relation> rels = spark
            .read()
            .textFile(graphBasePath + "/relation")
            .map(
                (MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class),
                Encoders.bean(Relation.class))
            .collectAsList();

        assertEquals(6, rels.size());

        assertEquals(0, getCount(rels, "1a"), "should be patched to 1b");
        assertEquals(0, getCount(rels, "2a"), "should be patched to 2b");

        for (final String untouched : new String[] { "10a", "20a", "15a", "25a" }) {
            assertEquals(2, getCount(rels, untouched), "not included in patching");
        }

        assertEquals(2, getCount(rels, "1b"), "patched from 1a");
        assertEquals(2, getCount(rels, "2b"), "patched from 2a");
    }

    /** Counts the relations having the given identifier either as source or as target. */
    private long getCount(List<Relation> rels, final String id) {
        long matches = 0;
        for (final Relation rel : rels) {
            if (rel.getSource().equals(id) || rel.getTarget().equals(id)) {
                matches++;
            }
        }
        return matches;
    }
}

View File

@ -0,0 +1,5 @@
{"oldId": "1a", "newId": "1b"}
{"oldId": "2a", "newId": "2b"}
{"oldId": "3a", "newId": "3b"}
{"oldId": "4a", "newId": "4b"}
{"oldId": "5a", "newId": "5b"}

View File

@ -0,0 +1,70 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header xmlns="http://namespace.openaire.eu/">
<dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
<dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<dr:objectIdentifier/>
<dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
<dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
<oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
</header>
<metadata xmlns="http://namespace.openaire.eu/">
<dc:title>多項式GCDを用いた復号法に関する研究</dc:title>
<dc:creator>上原, 剛</dc:creator>
<dc:creator>甲斐, 博</dc:creator>
<dc:creator>野田, 松太郎</dc:creator>
<dc:format>application/pdf</dc:format>
<dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
<dc:language>jpn</dc:language>
<dc:publisher>京都大学数理解析研究所</dc:publisher>
<dc:subject classid="ndc" classname="ndc"
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
<dc:type>Departmental Bulletin Paper</dc:type>
<dr:CobjCategory type="publication">0014</dr:CobjCategory>
<oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
<oaf:projectid/>
<oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
<oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
<oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
<oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
<oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
<oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
<oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
</metadata>
<about>
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
<baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
<identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
<datestamp>2021-04-13T13:36:29Z</datestamp>
<metadataNamespace/>
<originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
<baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
<identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
<datestamp>2012-07-12T14:15:41Z</datestamp>
<metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
</originDescription>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
classname="sysimport:crosswalk:repository"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -0,0 +1,6 @@
{"source":"1a","target":"10a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
{"source":"10a","target":"1a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
{"source":"2a","target":"20a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
{"source":"20a","target":"2a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
{"source":"15a","target":"25a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
{"source":"25a","target":"15a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}

File diff suppressed because one or more lines are too long

View File

@ -10,6 +10,7 @@ import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
@ -81,6 +82,7 @@ public class PrepareRelationsJob {
Set<String> relationFilter = Optional Set<String> relationFilter = Optional
.ofNullable(parser.get("relationFilter")) .ofNullable(parser.get("relationFilter"))
.map(String::toLowerCase)
.map(s -> Sets.newHashSet(Splitter.on(",").split(s))) .map(s -> Sets.newHashSet(Splitter.on(",").split(s)))
.orElse(new HashSet<>()); .orElse(new HashSet<>());
log.info("relationFilter: {}", relationFilter); log.info("relationFilter: {}", relationFilter);
@ -130,7 +132,7 @@ public class PrepareRelationsJob {
JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath) JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
.filter(rel -> relationFilter.contains(rel.getRelClass()) == false); .filter(rel -> relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())) == false);
JavaRDD<Relation> pruned = pruneRels( JavaRDD<Relation> pruned = pruneRels(
pruneRels( pruneRels(

View File

@ -16,7 +16,6 @@ import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource; import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document; import org.dom4j.Document;
@ -43,6 +42,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class XmlRecordFactory implements Serializable { public class XmlRecordFactory implements Serializable {

View File

@ -7,8 +7,6 @@ import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.util.List; import java.util.List;
import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
@ -25,6 +23,7 @@ import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
@ -137,9 +136,10 @@ public class XmlRecordFactoryTest {
@Test @Test
public void testEnermapsRecord() throws IOException, DocumentException { public void testEnermapsRecord() throws IOException, DocumentException {
String contextmap = "<entries><entry id=\"enermaps\" label=\"Energy Research\" name=\"context\" type=\"community\"/>" + String contextmap = "<entries><entry id=\"enermaps\" label=\"Energy Research\" name=\"context\" type=\"community\"/>"
"<entry id=\"enermaps::selection\" label=\"Featured dataset\" name=\"category\"/>"+ +
"<entry id=\"enermaps::selection::tgs00004\" label=\"Dataset title\" name=\"concept\"/>"+ "<entry id=\"enermaps::selection\" label=\"Featured dataset\" name=\"category\"/>" +
"<entry id=\"enermaps::selection::tgs00004\" label=\"Dataset title\" name=\"concept\"/>" +
"</entries>"; "</entries>";
ContextMapper contextMapper = ContextMapper.fromXml(contextmap); ContextMapper contextMapper = ContextMapper.fromXml(contextmap);

View File

@ -205,6 +205,11 @@
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
<version>1.0.7</version> <version>1.0.7</version>
</dependency> </dependency>
<dependency>
<groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId>
<version>0.0.7</version>
</dependency>
<dependency> <dependency>
<groupId>com.google.guava</groupId> <groupId>com.google.guava</groupId>