1
0
Fork 0

bulktagging wfs moved into common dhp-enrichment module

This commit is contained in:
Claudio Atzori 2020-05-11 17:32:06 +02:00
parent c403971c2f
commit 6d0b11252e
135 changed files with 550 additions and 796 deletions

View File

@ -16,16 +16,23 @@
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>the graph output path</description>
</property>
</parameters>
<start to="reset-outputpath"/>
<start to="reset_outputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset-outputpath">
<action name="reset_outputpath">
<fs>
<delete path='${workingDir}/blacklist'/>
<delete path="${workingDir}/blacklist"/>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="read_blacklist"/>
<error to="Kill"/>
@ -87,12 +94,14 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
<arg>--mergesPath</arg><arg>${workingDir}/mergesRelation</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,7 +0,0 @@
#sandboxName when not provided explicitly will be generated
sandboxName=${sandboxName}
sandboxDir=/user/${dhp.hadoop.frontend.user.name}/${sandboxName}
workingDir=${sandboxDir}/working_dir
oozie.wf.application.path = ${nameNode}${sandboxDir}/${oozieAppDir}
oozieTopWfApplicationPath = ${oozie.wf.application.path}

View File

@ -9,7 +9,7 @@
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-bulktag</artifactId>
<artifactId>dhp-enrichment</artifactId>
<dependencies>
<dependency>
@ -31,6 +31,12 @@
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
@ -43,23 +49,16 @@
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.9.11</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>23.3-jre</version>
</dependency>
<dependency>
<groupId>io.github.classgraph</groupId>
<artifactId>classgraph</artifactId>
<version>4.8.71</version>
</dependency>
</dependencies>
</project>

View File

@ -1,10 +1,11 @@
package eu.dnetlib.dhp.bulktag;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.schema.oaf.Result;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
@ -15,12 +16,9 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import java.util.Optional;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.community.*;
import eu.dnetlib.dhp.schema.oaf.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
public class SparkBulkTagJob {

View File

@ -1,15 +1,14 @@
package eu.dnetlib.dhp.community;
package eu.dnetlib.dhp.bulktag.community;
import com.google.gson.Gson;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.gson.Gson;
/** Created by miriam on 01/08/2018. */
public class Community implements Serializable {
@ -17,7 +16,7 @@ public class Community implements Serializable {
private String id;
private List<String> subjects = new ArrayList<>();
private List<Datasource> datasources = new ArrayList<>();
private List<Provider> providers = new ArrayList<>();
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
public String toJson() {
@ -27,7 +26,7 @@ public class Community implements Serializable {
public boolean isValid() {
return !getSubjects().isEmpty()
|| !getDatasources().isEmpty()
|| !getProviders().isEmpty()
|| !getZenodoCommunities().isEmpty();
}
@ -47,12 +46,12 @@ public class Community implements Serializable {
this.subjects = subjects;
}
public List<Datasource> getDatasources() {
return datasources;
public List<Provider> getProviders() {
return providers;
}
public void setDatasources(List<Datasource> datasources) {
this.datasources = datasources;
public void setProviders(List<Provider> providers) {
this.providers = providers;
}
public List<ZenodoCommunity> getZenodoCommunities() {

View File

@ -1,5 +1,14 @@
package eu.dnetlib.dhp.community;
package eu.dnetlib.dhp.bulktag.community;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
import eu.dnetlib.dhp.bulktag.criteria.Selection;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.Serializable;
import java.util.ArrayList;
@ -8,17 +17,6 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
import eu.dnetlib.dhp.selectioncriteria.Selection;
/** Created by miriam on 02/08/2018. */
public class CommunityConfiguration implements Serializable {
@ -84,7 +82,7 @@ public class CommunityConfiguration implements Serializable {
add(sbj.toLowerCase().trim(), p, subjectMap);
}
// get datasources
for (Datasource d : c.getDatasources()) {
for (Provider d : c.getProviders()) {
add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap);
}

View File

@ -1,11 +1,14 @@
package eu.dnetlib.dhp.community;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
package eu.dnetlib.dhp.bulktag.community;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
import eu.dnetlib.dhp.bulktag.criteria.Selection;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -14,15 +17,10 @@ import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
import eu.dnetlib.dhp.selectioncriteria.Selection;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import eu.dnetlib.dhp.selectioncriteria.VerbResolverFactory;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/** Created by miriam on 03/08/2018. */
public class CommunityConfigurationFactory {
@ -77,7 +75,7 @@ public class CommunityConfigurationFactory {
log.info(String.format("community id: %s", c.getId()));
c.setSubjects(parseSubjects(node));
c.setDatasources(parseDatasources(node));
c.setProviders(parseDatasources(node));
c.setZenodoCommunities(parseZenodoCommunities(node));
return c;
}
@ -96,17 +94,17 @@ public class CommunityConfigurationFactory {
return subjects;
}
private static List<Datasource> parseDatasources(final Node node) {
private static List<Provider> parseDatasources(final Node node) {
final List<Node> list = node.selectNodes("./datasources/datasource");
final List<Datasource> datasourceList = new ArrayList<>();
final List<Provider> providerList = new ArrayList<>();
for (Node n : list) {
Datasource d = new Datasource();
Provider d = new Provider();
d.setOpenaireId(n.selectSingleNode("./openaireId").getText());
d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver);
datasourceList.add(d);
providerList.add(d);
}
log.info("size of the datasource list " + datasourceList.size());
return datasourceList;
log.info("size of the datasource list " + providerList.size());
return providerList;
}
private static List<ZenodoCommunity> parseZenodoCommunities(final Node node) {

View File

@ -1,12 +1,12 @@
package eu.dnetlib.dhp.community;
package eu.dnetlib.dhp.bulktag.community;
import eu.dnetlib.dhp.bulktag.criteria.Selection;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import eu.dnetlib.dhp.selectioncriteria.Selection;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
public class Constraint implements Serializable {
private String verb;
private String field;

View File

@ -1,5 +1,11 @@
package eu.dnetlib.dhp.community;
package eu.dnetlib.dhp.bulktag.community;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
@ -8,14 +14,6 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
/** Created by miriam on 02/08/2018. */
public class Constraints implements Serializable {
private static final Log log = LogFactory.getLog(Constraints.class);

View File

@ -1,10 +1,10 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
package eu.dnetlib.dhp.bulktag.community;
import com.google.gson.Gson;
import java.io.Serializable;
/** Created by miriam on 03/08/2018. */
public class Pair<A, B> implements Serializable {
private A fst;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.community;
package eu.dnetlib.dhp.bulktag.community;
import java.io.Serializable;
import java.util.HashMap;

View File

@ -1,19 +1,17 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
package eu.dnetlib.dhp.bulktag.community;
import com.google.gson.Gson;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Node;
import com.google.gson.Gson;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import java.io.Serializable;
/** Created by miriam on 01/08/2018. */
public class Datasource implements Serializable {
private static final Log log = LogFactory.getLog(Datasource.class);
public class Provider implements Serializable {
private static final Log log = LogFactory.getLog(Provider.class);
private String openaireId;

View File

@ -1,15 +1,13 @@
package eu.dnetlib.dhp.community;
import java.util.List;
import org.dom4j.DocumentException;
package eu.dnetlib.dhp.bulktag.community;
import com.google.common.base.Joiner;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.dom4j.DocumentException;
import java.util.List;
public class QueryInformationSystem {
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "

View File

@ -1,20 +1,19 @@
package eu.dnetlib.dhp.community;
package eu.dnetlib.dhp.bulktag.community;
import static eu.dnetlib.dhp.community.TagginConstants.*;
import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.oaf.*;
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
/** Created by miriam on 02/08/2018. */
public class ResultTagger implements Serializable {
@ -239,8 +238,8 @@ public class ResultTagger implements Serializable {
Qualifier pa = new Qualifier();
pa.setClassid(inference_class_id);
pa.setClassname(inference_class_name);
pa.setSchemeid(DNET_SCHEMA_ID);
pa.setSchemename(DNET_SCHEMA_NAME);
pa.setSchemeid(DNET_PROVENANCE_ACTIONS);
pa.setSchemename(DNET_PROVENANCE_ACTIONS);
return pa;
}
}

View File

@ -1,5 +1,9 @@
package eu.dnetlib.dhp.community;
package eu.dnetlib.dhp.bulktag.community;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import java.io.Serializable;
import java.lang.reflect.Type;
@ -7,11 +11,6 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
public class SelectionConstraints implements Serializable {
private List<Constraints> criteria;

View File

@ -1,20 +1,14 @@
package eu.dnetlib.dhp.community;
package eu.dnetlib.dhp.bulktag.community;
public class TagginConstants {
public class TaggingConstants {
public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging";
public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";
public static final String CLASS_ID_SUBJECT = "community:subject";
public static final String CLASS_ID_DATASOURCE = "community:datasource";
public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
public static final String SCHEMA_ID = "dnet:provenanceActions";
public static final String COUNTER_GROUP = "Bulk Tagging";
public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";

View File

@ -1,11 +1,10 @@
package eu.dnetlib.dhp.community;
import java.io.Serializable;
import org.dom4j.Node;
package eu.dnetlib.dhp.bulktag.community;
import com.google.gson.Gson;
import org.dom4j.Node;
import java.io.Serializable;
/** Created by miriam on 01/08/2018. */
public class ZenodoCommunity implements Serializable {

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;

View File

@ -1,10 +1,10 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.lang.reflect.Type;
package eu.dnetlib.dhp.bulktag.criteria;
import com.google.gson.*;
import java.lang.reflect.Type;
public class InterfaceAdapter implements JsonSerializer, JsonDeserializer {
private static final String CLASSNAME = "CLASSNAME";

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
public interface Selection {

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;

View File

@ -1,16 +1,16 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
package eu.dnetlib.dhp.bulktag.criteria;
import io.github.classgraph.ClassGraph;
import io.github.classgraph.ClassInfo;
import io.github.classgraph.ClassInfoList;
import io.github.classgraph.ScanResult;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
public class VerbResolver implements Serializable {
private Map<String, Class<Selection>> map = null; // = new HashMap<>();
private final ClassGraph classgraph = new ClassGraph();

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.selectioncriteria;
package eu.dnetlib.dhp.bulktag.criteria;
public class VerbResolverFactory {

View File

@ -1,13 +1,13 @@
package eu.dnetlib.dhp;
import static eu.dnetlib.dhp.community.TagginConstants.ZENODO_COMMUNITY_INDICATOR;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
package eu.dnetlib.dhp.bulktag;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Software;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@ -18,37 +18,44 @@ import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.mortbay.util.IO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import eu.dnetlib.dhp.bulktag.SparkBulkTagJob;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Software;
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
public class BulkTagJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = eu.dnetlib.dhp.BulkTagJobTest.class.getClassLoader();
public static final String MOCK_IS_LOOK_UP_URL = "BASEURL:8280/is/services/isLookUp";
public static final String pathMap =
"{ \"author\" : \"$['author'][*]['fullname']\","
+ " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}";
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.BulkTagJobTest.class);
private static final Logger log = LoggerFactory.getLogger(BulkTagJobTest.class);
private static String taggingConf = "";
static {
try {
taggingConf = IO
taggingConf = IOUtils
.toString(
BulkTagJobTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml"));
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml"));
} catch (IOException e) {
e.printStackTrace();
}
@ -56,11 +63,11 @@ public class BulkTagJobTest {
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName());
workingDir = Files.createTempDirectory(BulkTagJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName());
conf.setAppName(BulkTagJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
@ -84,34 +91,21 @@ public class BulkTagJobTest {
@Test
public void noUpdatesTest() throws Exception {
final String pathMap = BulkTagJobTest.pathMap;
SparkBulkTagJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass().getResource("/eu/dnetlib/dhp/sample/dataset/no_updates").getPath(),
"-taggingConf",
taggingConf,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",
workingDir.toString() + "/dataset",
"-isLookUpUrl",
"http://beta.services.openaire.eu:8280/is/services/isLookUp",
"-pathMap",
"{ \"author\" : \"$['author'][*]['fullname']\","
+ " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"
// "-preparedInfoPath",
// getClass().getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo").getPath()
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates").getPath(),
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")
@ -134,34 +128,24 @@ public class BulkTagJobTest {
@Test
public void bulktagBySubjectNoPreviousContextTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext")
.getPath();
final String pathMap = BulkTagJobTest.pathMap;
SparkBulkTagJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource("/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext")
.getPath(),
"-taggingConf",
taggingConf,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",
workingDir.toString() + "/dataset",
"-isLookUpUrl",
"http://beta.services.openaire.eu:8280/is/services/isLookUp",
"-pathMap",
"{ \"author\" : \"$['author'][*]['fullname']\","
+ " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")
@ -240,32 +224,22 @@ public class BulkTagJobTest {
@Test
public void bulktagBySubjectPreviousContextNoProvenanceTest() throws Exception {
final String sourcePath = getClass()
.getResource(
"/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/contextnoprovenance")
.getPath();
final String pathMap = BulkTagJobTest.pathMap;
SparkBulkTagJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance")
.getPath(),
"-taggingConf",
taggingConf,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",
workingDir.toString() + "/dataset",
"-isLookUpUrl",
"http://beta.services.openaire.eu:8280/is/services/isLookUp",
"-pathMap",
"{ \"author\" : \"$['author'][*]['fullname']\","
+ " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@ -332,34 +306,23 @@ public class BulkTagJobTest {
@Test
public void bulktagByDatasourceTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource")
.getPath();
SparkBulkTagJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource("/eu/dnetlib/dhp/sample/publication/update_datasource")
.getPath(),
"-taggingConf",
taggingConf,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/publication",
"-isLookUpUrl",
"http://beta.services.openaire.eu:8280/is/services/isLookUp",
"-pathMap",
"{ \"author\" : \"$['author'][*]['fullname']\","
+ " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath", workingDir.toString() + "/publication",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.textFile(workingDir.toString() + "/publication")
@ -415,35 +378,24 @@ public class BulkTagJobTest {
@Test
public void bulktagByZenodoCommunityTest() throws Exception {
final String sourcePath = getClass()
.getResource(
"/eu/dnetlib/dhp/bulktag/sample/otherresearchproduct/update_zenodocommunity")
.getPath();
SparkBulkTagJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity")
.getPath(),
"-taggingConf",
taggingConf,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.OtherResearchProduct",
"-outputPath",
workingDir.toString() + "/orp",
"-isLookUpUrl",
"http://beta.services.openaire.eu:8280/is/services/isLookUp",
"-pathMap",
"{ \"author\" : \"$['author'][*]['fullname']\","
+ " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct",
"-outputPath", workingDir.toString() + "/orp",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<OtherResearchProduct> tmp = sc
.textFile(workingDir.toString() + "/orp")
@ -548,34 +500,23 @@ public class BulkTagJobTest {
@Test
public void bulktagBySubjectDatasourceTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource")
.getPath();
SparkBulkTagJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource("/eu/dnetlib/dhp/sample/dataset/update_subject_datasource")
.getPath(),
"-taggingConf",
taggingConf,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",
workingDir.toString() + "/dataset",
"-isLookUpUrl",
"http://beta.services.openaire.eu:8280/is/services/isLookUp",
"-pathMap",
"{ \"author\" : \"$['author'][*]['fullname']\","
+ " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")
@ -691,29 +632,17 @@ public class BulkTagJobTest {
SparkBulkTagJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass().getResource("/eu/dnetlib/dhp/sample/software/").getPath(),
"-taggingConf",
taggingConf,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Software",
"-outputPath",
workingDir.toString() + "/software",
"-isLookUpUrl",
"http://beta.services.openaire.eu:8280/is/services/isLookUp",
"-pathMap",
"{ \"author\" : \"$['author'][*]['fullname']\","
+ " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/software/").getPath(),
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
"-outputPath", workingDir.toString() + "/software",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Software> tmp = sc
.textFile(workingDir.toString() + "/software")
@ -796,35 +725,24 @@ public class BulkTagJobTest {
@Test
public void bulktagDatasourcewithConstraintsTest() throws Exception {
final String sourcePath = getClass()
.getResource(
"/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints")
.getPath();
SparkBulkTagJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints")
.getPath(),
"-taggingConf",
taggingConf,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",
workingDir.toString() + "/dataset",
"-isLookUpUrl",
"http://beta.services.openaire.eu:8280/is/services/isLookUp",
"-pathMap",
"{ \"author\" : \"$['author'][*]['fullname']\","
+ " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")

View File

@ -1,23 +1,21 @@
package eu.dnetlib.dhp;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
package eu.dnetlib.dhp.bulktag;
import com.google.gson.Gson;
import eu.dnetlib.dhp.bulktag.community.CommunityConfiguration;
import eu.dnetlib.dhp.bulktag.community.CommunityConfigurationFactory;
import eu.dnetlib.dhp.bulktag.community.Constraint;
import eu.dnetlib.dhp.bulktag.community.SelectionConstraints;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.DocumentException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import com.google.gson.Gson;
import eu.dnetlib.dhp.community.CommunityConfiguration;
import eu.dnetlib.dhp.community.CommunityConfigurationFactory;
import eu.dnetlib.dhp.community.Constraint;
import eu.dnetlib.dhp.community.SelectionConstraints;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
/** Created by miriam on 03/08/2018. */
public class CommunityConfigurationFactoryTest {

View File

@ -5,12 +5,15 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
@ -26,12 +29,11 @@ import eu.dnetlib.dhp.schema.oaf.Software;
import scala.Tuple2;
public class CountryPropagationJobTest {
private static final Logger log = LoggerFactory.getLogger(CountryPropagationJobTest.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = CountryPropagationJobTest.class.getClassLoader();
private static SparkSession spark;
private static Path workingDir;
@ -101,8 +103,8 @@ public class CountryPropagationJobTest {
Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count());
Dataset<String> countryExploded = verificationDs
.flatMap(row -> row.getCountry().iterator(), Encoders.bean(Country.class))
.map(c -> c.getClassid(), Encoders.STRING());
.flatMap((FlatMapFunction<Software, Country>) row -> row.getCountry().iterator(), Encoders.bean(Country.class))
.map((MapFunction<Country, String>) c -> c.getClassid(), Encoders.STRING());
Assertions.assertEquals(9, countryExploded.count());
@ -115,8 +117,7 @@ public class CountryPropagationJobTest {
Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count());
Dataset<Tuple2<String, String>> countryExplodedWithCountryclassid = verificationDs
.flatMap(
row -> {
.flatMap((FlatMapFunction<Software, Tuple2<String, String>>) row -> {
List<Tuple2<String, String>> prova = new ArrayList();
List<Country> country_list = row.getCountry();
country_list
@ -127,8 +128,7 @@ public class CountryPropagationJobTest {
new Tuple2<>(
row.getId(), c.getClassid())));
return prova.iterator();
},
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
Assertions.assertEquals(9, countryExplodedWithCountryclassid.count());
@ -178,7 +178,7 @@ public class CountryPropagationJobTest {
Dataset<Tuple2<String, String>> countryExplodedWithCountryclassname = verificationDs
.flatMap(
row -> {
(FlatMapFunction<Software, Tuple2<String, String>>) row -> {
List<Tuple2<String, String>> prova = new ArrayList();
List<Country> country_list = row.getCountry();
country_list
@ -239,7 +239,7 @@ public class CountryPropagationJobTest {
Dataset<Tuple2<String, String>> countryExplodedWithCountryProvenance = verificationDs
.flatMap(
row -> {
(FlatMapFunction<Software, Tuple2<String, String>>) row -> {
List<Tuple2<String, String>> prova = new ArrayList();
List<Country> country_list = row.getCountry();
country_list

View File

@ -29,8 +29,6 @@ public class OrcidPropagationJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = OrcidPropagationJobTest.class.getClassLoader();
private static SparkSession spark;
private static Path workingDir;

View File

@ -9,6 +9,7 @@ import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
@ -29,8 +30,6 @@ public class ProjectPropagationJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = ProjectPropagationJobTest.class.getClassLoader();
private static SparkSession spark;
private static Path workingDir;
@ -72,34 +71,26 @@ public class ProjectPropagationJobTest {
@Test
public void NoUpdateTest() throws Exception {
SparkResultToProjectThroughSemRelJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
// "-sourcePath",
// getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(),
"-hive_metastore_uris",
"",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-potentialUpdatePath",
getClass()
final String potentialUpdateDate = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getPath();
final String alreadyLinkedPath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked")
.getPath(),
.getPath();
SparkResultToProjectThroughSemRelJob.main(
new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-hive_metastore_uris", "",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-potentialUpdatePath", potentialUpdateDate,
"-alreadyLinkedPath", alreadyLinkedPath,
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation")
@ -115,34 +106,26 @@ public class ProjectPropagationJobTest {
*/
@Test
public void UpdateTenTest() throws Exception {
SparkResultToProjectThroughSemRelJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
// "-sourcePath",
// getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(),
"-hive_metastore_uris",
"",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-potentialUpdatePath",
getClass()
final String potentialUpdatePath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getPath();
final String alreadyLinkedPath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked")
.getPath(),
.getPath();
SparkResultToProjectThroughSemRelJob.main(
new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-hive_metastore_uris", "",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-potentialUpdatePath", potentialUpdatePath,
"-alreadyLinkedPath", alreadyLinkedPath,
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation")
@ -160,18 +143,18 @@ public class ProjectPropagationJobTest {
.assertEquals(
5,
verificationDs
.filter(
r -> r.getSource().substring(0, 2).equals("50")
&& r.getTarget().substring(0, 2).equals("40")
.filter((FilterFunction<Relation>) r ->
r.getSource().startsWith("50")
&& r.getTarget().startsWith("40")
&& r.getRelClass().equals("isProducedBy"))
.count());
Assertions
.assertEquals(
5,
verificationDs
.filter(
r -> r.getSource().substring(0, 2).equals("40")
&& r.getTarget().substring(0, 2).equals("50")
.filter((FilterFunction<Relation>) r ->
r.getSource().startsWith("40")
&& r.getTarget().startsWith("50")
&& r.getRelClass().equals("produces"))
.count());
@ -194,34 +177,26 @@ public class ProjectPropagationJobTest {
*/
@Test
public void UpdateMixTest() throws Exception {
SparkResultToProjectThroughSemRelJob
.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
// "-sourcePath",
// getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(),
"-hive_metastore_uris",
"",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-potentialUpdatePath",
getClass()
final String potentialUpdatepath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getPath();
final String alreadyLinkedPath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked")
.getPath(),
.getPath();
SparkResultToProjectThroughSemRelJob.main(
new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-hive_metastore_uris", "",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-potentialUpdatePath", potentialUpdatepath,
"-alreadyLinkedPath", alreadyLinkedPath,
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation")
@ -242,18 +217,18 @@ public class ProjectPropagationJobTest {
.assertEquals(
4,
verificationDs
.filter(
r -> r.getSource().substring(0, 2).equals("50")
&& r.getTarget().substring(0, 2).equals("40")
.filter((FilterFunction<Relation>) r ->
r.getSource().startsWith("50")
&& r.getTarget().startsWith("40")
&& r.getRelClass().equals("isProducedBy"))
.count());
Assertions
.assertEquals(
4,
verificationDs
.filter(
r -> r.getSource().substring(0, 2).equals("40")
&& r.getTarget().substring(0, 2).equals("50")
.filter((FilterFunction<Relation>) r ->
r.getSource().startsWith("40")
&& r.getTarget().startsWith("50")
&& r.getRelClass().equals("produces"))
.count());

View File

@ -32,8 +32,6 @@ public class ResultToCommunityJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = ResultToCommunityJobTest.class.getClassLoader();
private static SparkSession spark;
private static Path workingDir;
@ -68,33 +66,24 @@ public class ResultToCommunityJobTest {
@Test
public void testSparkResultToCommunityFromOrganizationJob() throws Exception {
SparkResultToCommunityFromOrganizationJob
.main(
final String preparedInfoPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo")
.getPath();
SparkResultToCommunityFromOrganizationJob.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", getClass()
.getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/sample")
.getPath(),
"-hive_metastore_uris",
"",
"-saveGraph",
"true",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",
workingDir.toString() + "/dataset",
"-preparedInfoPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo")
.getPath()
"-hive_metastore_uris", "",
"-saveGraph", "true",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-preparedInfoPath", preparedInfoPath
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")
@ -217,13 +206,6 @@ public class ResultToCommunityJobTest {
.get(0)
.getString(0));
/*
* {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::8d817039a63710fcf97e30f14662c6c8"}
* "context" ["id": euromarine] updates = 1
* {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6"} context
* = [ni, euromarine] updates = 1
*/
query = "select id, MyT.id community "
+ "from dataset "
+ "lateral view explode(context) c as MyT "

View File

@ -29,32 +29,21 @@ import eu.dnetlib.dhp.schema.oaf.Dataset;
public class ResultToCommunityJobTest {
private static final Logger log = LoggerFactory
.getLogger(
eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class);
private static final Logger log = LoggerFactory.getLogger(ResultToCommunityJobTest.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class
.getClassLoader();
private static SparkSession spark;
private static Path workingDir;
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(
eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class
.getSimpleName());
workingDir = Files.createTempDirectory(ResultToCommunityJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf
.setAppName(
eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class
.getSimpleName());
conf.setAppName(ResultToCommunityJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
@ -65,7 +54,7 @@ public class ResultToCommunityJobTest {
spark = SparkSession
.builder()
.appName(OrcidPropagationJobTest.class.getSimpleName())
.appName(ResultToCommunityJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@ -83,22 +72,18 @@ public class ResultToCommunityJobTest {
new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample")
"-sourcePath", getClass()
.getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample")
.getPath(),
"-hive_metastore_uris", "",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-preparedInfoPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo")
"-preparedInfoPath", getClass()
.getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo")
.getPath()
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")

View File

@ -23,23 +23,19 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class Result2OrganizationJobTest {
public class ResultToOrganizationJobTest {
private static final Logger log = LoggerFactory.getLogger(Result2OrganizationJobTest.class);
private static final Logger log = LoggerFactory.getLogger(ResultToOrganizationJobTest.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = Result2OrganizationJobTest.class.getClassLoader();
private static SparkSession spark;
private static Path workingDir;
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(
SparkResultToOrganizationFromIstRepoJob.class.getSimpleName());
workingDir = Files.createTempDirectory(SparkResultToOrganizationFromIstRepoJob.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
@ -72,40 +68,29 @@ public class Result2OrganizationJobTest {
*/
@Test
public void NoUpdateTest() throws Exception {
SparkResultToOrganizationFromIstRepoJob
.main(
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix")
.getPath();
final String datasourceOrganizationPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization")
.getPath();
final String alreadyLinkedPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked")
.getPath();
SparkResultToOrganizationFromIstRepoJob.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix")
.getPath(),
"-hive_metastore_uris",
"",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Software",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-datasourceOrganizationPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked")
.getPath(),
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-hive_metastore_uris", "",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-datasourceOrganizationPath", datasourceOrganizationPath,
"-alreadyLinkedPath", alreadyLinkedPath,
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation")
@ -123,40 +108,29 @@ public class Result2OrganizationJobTest {
*/
@Test
public void UpdateNoMixTest() throws Exception {
SparkResultToOrganizationFromIstRepoJob
.main(
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix")
.getPath();
final String datasourceOrganizationPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization")
.getPath();
final String alreadyLinkedPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked")
.getPath();
SparkResultToOrganizationFromIstRepoJob.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix")
.getPath(),
"-hive_metastore_uris",
"",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Software",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-datasourceOrganizationPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked")
.getPath(),
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-hive_metastore_uris", "",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-datasourceOrganizationPath", datasourceOrganizationPath,
"-alreadyLinkedPath", alreadyLinkedPath,
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation")
@ -197,40 +171,29 @@ public class Result2OrganizationJobTest {
@Test
public void UpdateMixTest() throws Exception {
SparkResultToOrganizationFromIstRepoJob
.main(
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix")
.getPath();
final String datasourceOrganizationPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization")
.getPath();
final String alreadyLinkedPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked")
.getPath();
SparkResultToOrganizationFromIstRepoJob.main(
new String[] {
"-isTest",
Boolean.TRUE.toString(),
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix")
.getPath(),
"-hive_metastore_uris",
"",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Software",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-datasourceOrganizationPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked")
.getPath(),
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-hive_metastore_uris", "",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-datasourceOrganizationPath", datasourceOrganizationPath,
"-alreadyLinkedPath", alreadyLinkedPath,
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation")

View File

@ -2,17 +2,17 @@
<community id="fet-fp7">
<oacommunity/>
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="fet-h2020">
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="oa-pg">
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="ee">
@ -35,7 +35,7 @@
<subject>SDG9 - Industry innovation and infrastructure</subject>
<subject>SDG16 - Peace justice and strong institutions</subject>
</subjects>
<datasources/>
<providers/>
<zenodocommunities>
<zenodocommunity>
<zenodoid>123</zenodoid>
@ -45,12 +45,12 @@
</community>
<community id="dh-ch">
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="fam">
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="ni">
@ -74,7 +74,7 @@
<subject>brain magnetic resonance imaging</subject>
<subject>brain abnormalities</subject>
</subjects>
<datasources>
<providers>
<datasource>
<openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId>
<selcriteria/>
@ -95,7 +95,7 @@
<openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId>
<selcriteria/>
</datasource>
</datasources>
</providers>
<zenodocommunities/>
</community>
<community id="mes">
@ -106,12 +106,12 @@
<subject>aqua</subject>
<subject>sea</subject>
</subjects>
<datasources>
<providers>
<datasource>
<openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId>
<selcriteria/>
</datasource>
</datasources>
</providers>
<zenodocommunities/>
</community>
<community id="aginfra">
@ -134,7 +134,7 @@
<subject>food distribution</subject>
<subject>forestry</subject>
</subjects>
<datasources>
<providers>
<datasource>
<openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId>
<selcriteria/>
@ -159,18 +159,18 @@
<openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId>
<selcriteria/>
</datasource>
</datasources>
</providers>
<zenodocommunities/>
</community>
<community id="clarin">
<oacommunity>oac_clarin</oacommunity>
<subjects/>
<datasources>
<providers>
<datasource>
<openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId>
<selcriteria/>
</datasource>
</datasources>
</providers>
<zenodocommunities/>
</community>
</communities>

View File

@ -2,17 +2,17 @@
<community id="fet-fp7">
<oacommunity/>
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="fet-h2020">
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="oa-pg">
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="ee">
@ -35,7 +35,7 @@
<subject>SDG9 - Industry innovation and infrastructure</subject>
<subject>SDG16 - Peace justice and strong institutions</subject>
</subjects>
<datasources/>
<providers/>
<zenodocommunities>
<zenodocommunity>
<zenodoid>123</zenodoid>
@ -45,12 +45,12 @@
</community>
<community id="dh-ch">
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="fam">
<subjects/>
<datasources/>
<providers/>
<zenodocommunities/>
</community>
<community id="ni">
@ -74,7 +74,7 @@
<subject>brain magnetic resonance imaging</subject>
<subject>brain abnormalities</subject>
</subjects>
<datasources>
<providers>
<datasource>
<openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId>
<selcriteria/>
@ -95,7 +95,7 @@
<openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId>
<selcriteria/>
</datasource>
</datasources>
</providers>
<zenodocommunities/>
</community>
<community id="mes">
@ -106,12 +106,12 @@
<subject>aqua</subject>
<subject>sea</subject>
</subjects>
<datasources>
<providers>
<datasource>
<openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId>
<selcriteria/>
</datasource>
</datasources>
</providers>
<zenodocommunities/>
</community>
<community id="aginfra">
@ -134,7 +134,7 @@
<subject>food distribution</subject>
<subject>forestry</subject>
</subjects>
<datasources>
<providers>
<datasource>
<openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId>
<selcriteria/>
@ -159,30 +159,30 @@
<openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId>
<selcriteria/>
</datasource>
</datasources>
</providers>
<zenodocommunities/>
</community>
<community id="clarin">
<oacommunity>oac_clarin</oacommunity>
<subjects/>
<datasources>
<providers>
<datasource>
<openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId>
<selcriteria/>
</datasource>
</datasources>
</providers>
<zenodocommunities/>
</community>
<community id="dariah">
<oacommunity>oaa_dariah</oacommunity>
<subjects/>
<datasources>
<providers>
<datasource>
<openaireId>openaire____::1cfdb2e14977f31a98e0118283401f32</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}
</selcriteria>
</datasource>
</datasources>
</providers>
<zenodocommunities>
<zenodocommunity>
<zenodoid>dimpo</zenodoid>

Some files were not shown because too many files have changed in this diff Show More