bulktagging wfs moved into common dhp-enrichment module

Claudio Atzori 2020-05-11 17:32:06 +02:00
parent c403971c2f
commit 6d0b11252e
135 changed files with 550 additions and 796 deletions

View File

@@ -1,31 +1,38 @@
 <workflow-app name="blacklisting" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>postgresURL</name>
             <description>the url of the postgress server to query</description>
         </property>
         <property>
             <name>postgresUser</name>
             <description>the username to access the postgres db</description>
         </property>
         <property>
             <name>postgresPassword</name>
             <description>the postgres password</description>
         </property>
         <property>
             <name>sourcePath</name>
             <description>the source path</description>
         </property>
+        <property>
+            <name>outputPath</name>
+            <description>the graph output path</description>
+        </property>
     </parameters>

-    <start to="reset-outputpath"/>
+    <start to="reset_outputpath"/>

     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>

-    <action name="reset-outputpath">
+    <action name="reset_outputpath">
         <fs>
-            <delete path='${workingDir}/blacklist'/>
+            <delete path="${workingDir}/blacklist"/>
+            <delete path="${outputPath}"/>
+            <mkdir path="${outputPath}"/>
         </fs>
         <ok to="read_blacklist"/>
         <error to="Kill"/>
@@ -87,12 +94,14 @@
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>
             <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
-            <arg>--outputPath</arg><arg>${workingDir}/relation</arg>
+            <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
             <arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
             <arg>--mergesPath</arg><arg>${workingDir}/mergesRelation</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>
     </action>
+
     <end name="End"/>
 </workflow-app>

View File

@@ -1,7 +0,0 @@
-#sandboxName when not provided explicitly will be generated
-sandboxName=${sandboxName}
-sandboxDir=/user/${dhp.hadoop.frontend.user.name}/${sandboxName}
-workingDir=${sandboxDir}/working_dir
-oozie.wf.application.path = ${nameNode}${sandboxDir}/${oozieAppDir}
-oozieTopWfApplicationPath = ${oozie.wf.application.path}

View File

@@ -9,7 +9,7 @@
     </parent>
     <modelVersion>4.0.0</modelVersion>

-    <artifactId>dhp-bulktag</artifactId>
+    <artifactId>dhp-enrichment</artifactId>

     <dependencies>
         <dependency>
@@ -31,6 +31,12 @@
             <artifactId>dhp-schemas</artifactId>
             <version>${project.version}</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-hive_2.11</artifactId>
+            <scope>test</scope>
+        </dependency>

         <dependency>
             <groupId>dom4j</groupId>
             <artifactId>dom4j</artifactId>
@@ -43,23 +49,16 @@
             <groupId>com.jayway.jsonpath</groupId>
             <artifactId>json-path</artifactId>
         </dependency>
-        <dependency>
-            <groupId>org.reflections</groupId>
-            <artifactId>reflections</artifactId>
-            <version>0.9.11</version>
-            <scope>compile</scope>
-        </dependency>
-        <dependency>
-            <groupId>com.google.guava</groupId>
-            <artifactId>guava</artifactId>
-            <version>23.3-jre</version>
-        </dependency>
         <dependency>
             <groupId>io.github.classgraph</groupId>
             <artifactId>classgraph</artifactId>
             <version>4.8.71</version>
         </dependency>

     </dependencies>

 </project>

View File

@@ -1,10 +1,11 @@
 package eu.dnetlib.dhp.bulktag;

-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
-import java.util.Optional;
-
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.gson.Gson;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.bulktag.community.*;
+import eu.dnetlib.dhp.schema.oaf.Result;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
@@ -15,12 +16,9 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.gson.Gson;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.community.*;
-import eu.dnetlib.dhp.schema.oaf.*;
+import java.util.Optional;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

 public class SparkBulkTagJob {

View File

@@ -1,15 +1,14 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;
+
+import com.google.gson.Gson;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;

 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;

-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import com.google.gson.Gson;
-
 /** Created by miriam on 01/08/2018. */
 public class Community implements Serializable {
@@ -17,7 +16,7 @@ public class Community implements Serializable {

     private String id;
     private List<String> subjects = new ArrayList<>();
-    private List<Datasource> datasources = new ArrayList<>();
+    private List<Provider> providers = new ArrayList<>();
     private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();

     public String toJson() {
@@ -27,7 +26,7 @@ public class Community implements Serializable {

     public boolean isValid() {
         return !getSubjects().isEmpty()
-            || !getDatasources().isEmpty()
+            || !getProviders().isEmpty()
             || !getZenodoCommunities().isEmpty();
     }

@@ -47,12 +46,12 @@ public class Community implements Serializable {
         this.subjects = subjects;
     }

-    public List<Datasource> getDatasources() {
-        return datasources;
+    public List<Provider> getProviders() {
+        return providers;
     }

-    public void setDatasources(List<Datasource> datasources) {
-        this.datasources = datasources;
+    public void setProviders(List<Provider> providers) {
+        this.providers = providers;
     }

     public List<ZenodoCommunity> getZenodoCommunities() {

View File

@@ -1,5 +1,14 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
+import eu.dnetlib.dhp.bulktag.criteria.Selection;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;

 import java.io.Serializable;
 import java.util.ArrayList;
@@ -8,17 +17,6 @@ import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;

-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
-
-import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
-import eu.dnetlib.dhp.selectioncriteria.Selection;
-
 /** Created by miriam on 02/08/2018. */
 public class CommunityConfiguration implements Serializable {
@@ -84,7 +82,7 @@ public class CommunityConfiguration implements Serializable {
             add(sbj.toLowerCase().trim(), p, subjectMap);
         }
         // get datasources
-        for (Datasource d : c.getDatasources()) {
+        for (Provider d : c.getProviders()) {
             add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap);
         }

View File

@@ -1,11 +1,14 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;

-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
+import eu.dnetlib.dhp.bulktag.criteria.Selection;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -14,15 +17,10 @@ import org.dom4j.DocumentException;
 import org.dom4j.Node;
 import org.dom4j.io.SAXReader;

-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
-
-import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
-import eu.dnetlib.dhp.selectioncriteria.Selection;
-import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
-import eu.dnetlib.dhp.selectioncriteria.VerbResolverFactory;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;

 /** Created by miriam on 03/08/2018. */
 public class CommunityConfigurationFactory {
@@ -77,7 +75,7 @@ public class CommunityConfigurationFactory {
         log.info(String.format("community id: %s", c.getId()));
         c.setSubjects(parseSubjects(node));
-        c.setDatasources(parseDatasources(node));
+        c.setProviders(parseDatasources(node));
         c.setZenodoCommunities(parseZenodoCommunities(node));
         return c;
     }
@@ -96,17 +94,17 @@ public class CommunityConfigurationFactory {
         return subjects;
     }

-    private static List<Datasource> parseDatasources(final Node node) {
+    private static List<Provider> parseDatasources(final Node node) {
         final List<Node> list = node.selectNodes("./datasources/datasource");
-        final List<Datasource> datasourceList = new ArrayList<>();
+        final List<Provider> providerList = new ArrayList<>();
         for (Node n : list) {
-            Datasource d = new Datasource();
+            Provider d = new Provider();
             d.setOpenaireId(n.selectSingleNode("./openaireId").getText());
             d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver);
-            datasourceList.add(d);
+            providerList.add(d);
         }
-        log.info("size of the datasource list " + datasourceList.size());
-        return datasourceList;
+        log.info("size of the datasource list " + providerList.size());
+        return providerList;
     }

     private static List<ZenodoCommunity> parseZenodoCommunities(final Node node) {
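
The parseDatasources(...) method above is plain dom4j: read the community configuration XML and walk the ./datasources/datasource nodes. A minimal self-contained sketch of the same pattern, with an invented XML fragment standing in for the real configuration:

import org.dom4j.Document;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

import java.io.StringReader;
import java.util.List;

public class Dom4jSketch {
    public static void main(String[] args) throws Exception {
        // invented fragment mirroring the <datasources> layout parsed above
        String xml = "<community id=\"c1\"><datasources>"
            + "<datasource><openaireId>ds|id::1234</openaireId></datasource>"
            + "</datasources></community>";
        Document doc = new SAXReader().read(new StringReader(xml));
        List<Node> list = doc.getRootElement().selectNodes("./datasources/datasource");
        for (Node n : list) {
            // same per-node accessor used by parseDatasources(...)
            System.out.println(n.selectSingleNode("./openaireId").getText());
        }
    }
}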

View File

@@ -1,12 +1,12 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;
+
+import eu.dnetlib.dhp.bulktag.criteria.Selection;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;

 import java.io.Serializable;
 import java.lang.reflect.InvocationTargetException;

-import eu.dnetlib.dhp.selectioncriteria.Selection;
-import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
-
 public class Constraint implements Serializable {
     private String verb;
     private String field;

View File

@@ -1,5 +1,11 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;
+
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;

 import java.io.Serializable;
 import java.lang.reflect.InvocationTargetException;
@@ -8,14 +14,6 @@ import java.util.Collection;
 import java.util.List;
 import java.util.Map;

-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import com.google.gson.Gson;
-import com.google.gson.reflect.TypeToken;
-
-import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
-
 /** Created by miriam on 02/08/2018. */
 public class Constraints implements Serializable {
     private static final Log log = LogFactory.getLog(Constraints.class);

View File

@@ -1,10 +1,10 @@
-package eu.dnetlib.dhp.community;
-
-import java.io.Serializable;
+package eu.dnetlib.dhp.bulktag.community;

 import com.google.gson.Gson;

+import java.io.Serializable;
+
 /** Created by miriam on 03/08/2018. */
 public class Pair<A, B> implements Serializable {
     private A fst;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;

 import java.io.Serializable;
 import java.util.HashMap;

View File

@@ -1,19 +1,17 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;

-import java.io.Serializable;
-
+import com.google.gson.Gson;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.dom4j.Node;

-import com.google.gson.Gson;
-
-import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
+import java.io.Serializable;

 /** Created by miriam on 01/08/2018. */
-public class Datasource implements Serializable {
-    private static final Log log = LogFactory.getLog(Datasource.class);
+public class Provider implements Serializable {
+    private static final Log log = LogFactory.getLog(Provider.class);

     private String openaireId;

View File

@@ -1,15 +1,13 @@
-package eu.dnetlib.dhp.community;
-
-import java.util.List;
-
-import org.dom4j.DocumentException;
+package eu.dnetlib.dhp.bulktag.community;

 import com.google.common.base.Joiner;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import org.dom4j.DocumentException;
+
+import java.util.List;

 public class QueryInformationSystem {
     private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "

View File

@@ -1,20 +1,19 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;

-import static eu.dnetlib.dhp.community.TagginConstants.*;
+import com.google.gson.Gson;
+import com.jayway.jsonpath.DocumentContext;
+import com.jayway.jsonpath.JsonPath;
+import eu.dnetlib.dhp.schema.oaf.*;
+import org.apache.commons.lang3.StringUtils;

 import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;

-import org.apache.commons.lang3.StringUtils;
-
-import com.google.gson.Gson;
-import com.jayway.jsonpath.DocumentContext;
-import com.jayway.jsonpath.JsonPath;
-
-import eu.dnetlib.dhp.schema.oaf.*;
+import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.*;

 /** Created by miriam on 02/08/2018. */
 public class ResultTagger implements Serializable {
@@ -51,7 +50,7 @@ public class ResultTagger implements Serializable {
     }

     public <R extends Result> R enrichContextCriteria(
         final R result, final CommunityConfiguration conf, final Map<String, String> criteria) {

     //    }
     //    public Result enrichContextCriteria(final Result result, final CommunityConfiguration
@@ -239,8 +238,8 @@ public class ResultTagger implements Serializable {
         Qualifier pa = new Qualifier();
         pa.setClassid(inference_class_id);
         pa.setClassname(inference_class_name);
-        pa.setSchemeid(DNET_SCHEMA_ID);
-        pa.setSchemename(DNET_SCHEMA_NAME);
+        pa.setSchemeid(DNET_PROVENANCE_ACTIONS);
+        pa.setSchemename(DNET_PROVENANCE_ACTIONS);
         return pa;
     }
 }
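
The provenance Qualifier is now built from eu.dnetlib.dhp.schema.common.ModelConstants instead of the module-local DNET_SCHEMA_ID/DNET_SCHEMA_NAME pair (both carried the value "dnet:provenanceActions", see the TaggingConstants diff below). A minimal sketch of the resulting construction; the helper name and example arguments are illustrative, not part of the commit:

import eu.dnetlib.dhp.schema.oaf.Qualifier;

import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;

public class QualifierSketch {
    // hypothetical helper mirroring the getTagQualifier-style code above
    static Qualifier provenance(String classId, String className) {
        Qualifier pa = new Qualifier();
        pa.setClassid(classId);                  // e.g. "community:subject"
        pa.setClassname(className);              // e.g. "Bulktagging for Community - Subject"
        pa.setSchemeid(DNET_PROVENANCE_ACTIONS); // "dnet:provenanceActions"
        pa.setSchemename(DNET_PROVENANCE_ACTIONS);
        return pa;
    }
}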

View File

@@ -1,5 +1,9 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;
+
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;

 import java.io.Serializable;
 import java.lang.reflect.Type;
@@ -7,11 +11,6 @@ import java.util.Collection;
 import java.util.List;
 import java.util.Map;

-import com.google.gson.Gson;
-import com.google.gson.reflect.TypeToken;
-
-import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
-
 public class SelectionConstraints implements Serializable {
     private List<Constraints> criteria;

View File

@@ -1,20 +1,14 @@
-package eu.dnetlib.dhp.community;
+package eu.dnetlib.dhp.bulktag.community;

-public class TagginConstants {
+public class TaggingConstants {

     public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging";

-    public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions";
-    public static final String DNET_SCHEMA_ID = "dnet:provenanceActions";
-
     public static final String CLASS_ID_SUBJECT = "community:subject";
     public static final String CLASS_ID_DATASOURCE = "community:datasource";
     public static final String CLASS_ID_CZENODO = "community:zenodocommunity";

-    public static final String SCHEMA_ID = "dnet:provenanceActions";
-    public static final String COUNTER_GROUP = "Bulk Tagging";
-
     public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";

     public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";

View File

@@ -1,11 +1,10 @@
-package eu.dnetlib.dhp.community;
-
-import java.io.Serializable;
-
-import org.dom4j.Node;
+package eu.dnetlib.dhp.bulktag.community;

 import com.google.gson.Gson;
+import org.dom4j.Node;
+
+import java.io.Serializable;

 /** Created by miriam on 01/08/2018. */
 public class ZenodoCommunity implements Serializable {

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 import java.io.Serializable;

View File

@@ -1,10 +1,10 @@
-package eu.dnetlib.dhp.selectioncriteria;
-
-import java.lang.reflect.Type;
+package eu.dnetlib.dhp.bulktag.criteria;

 import com.google.gson.*;

+import java.lang.reflect.Type;
+
 public class InterfaceAdapter implements JsonSerializer, JsonDeserializer {

     private static final String CLASSNAME = "CLASSNAME";
private static final String CLASSNAME = "CLASSNAME"; private static final String CLASSNAME = "CLASSNAME";

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 public interface Selection {

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 import java.lang.annotation.ElementType;
 import java.lang.annotation.Retention;

View File

@@ -1,16 +1,16 @@
-package eu.dnetlib.dhp.selectioncriteria;
-
-import java.io.Serializable;
-import java.lang.reflect.InvocationTargetException;
-import java.util.Map;
-import java.util.stream.Collectors;
+package eu.dnetlib.dhp.bulktag.criteria;

 import io.github.classgraph.ClassGraph;
 import io.github.classgraph.ClassInfo;
 import io.github.classgraph.ClassInfoList;
 import io.github.classgraph.ScanResult;

+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Map;
+import java.util.stream.Collectors;
+
 public class VerbResolver implements Serializable {
     private Map<String, Class<Selection>> map = null; // = new HashMap<>();
     private final ClassGraph classgraph = new ClassGraph();
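
VerbResolver discovers Selection implementations by scanning the classpath with io.github.classgraph, which the pom above keeps while dropping org.reflections and the pinned guava. A hedged sketch of such a scan against classgraph 4.8.x; the @VerbClass annotation name and the package filter are assumptions for illustration, not taken from the commit:

import io.github.classgraph.ClassGraph;
import io.github.classgraph.ClassInfo;
import io.github.classgraph.ScanResult;

import java.util.HashMap;
import java.util.Map;

public class VerbScanSketch {
    static Map<String, Class<?>> scanVerbs() {
        Map<String, Class<?>> verbs = new HashMap<>();
        try (ScanResult scan = new ClassGraph()
            .enableAnnotationInfo()
            .whitelistPackages("eu.dnetlib.dhp.bulktag.criteria") // assumed package filter
            .scan()) {
            // @VerbClass is an assumed annotation name, used here only for illustration
            for (ClassInfo ci : scan.getClassesWithAnnotation("eu.dnetlib.dhp.bulktag.criteria.VerbClass")) {
                verbs.put(ci.getSimpleName().toLowerCase(), ci.loadClass());
            }
        }
        return verbs;
    }
}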

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.selectioncriteria;
+package eu.dnetlib.dhp.bulktag.criteria;

 public class VerbResolverFactory {

View File

@@ -1,13 +1,13 @@
-package eu.dnetlib.dhp;
+package eu.dnetlib.dhp.bulktag;

-import static eu.dnetlib.dhp.community.TagginConstants.ZENODO_COMMUNITY_INDICATOR;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Software;
 import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -18,37 +18,44 @@ import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
-import org.mortbay.util.IO;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;

-import eu.dnetlib.dhp.bulktag.SparkBulkTagJob;
-import eu.dnetlib.dhp.schema.oaf.Dataset;
-import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
-import eu.dnetlib.dhp.schema.oaf.Publication;
-import eu.dnetlib.dhp.schema.oaf.Software;
+import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;

 public class BulkTagJobTest {

     private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

-    private static final ClassLoader cl = eu.dnetlib.dhp.BulkTagJobTest.class.getClassLoader();
+    public static final String MOCK_IS_LOOK_UP_URL = "BASEURL:8280/is/services/isLookUp";
+
+    public static final String pathMap =
+        "{ \"author\" : \"$['author'][*]['fullname']\","
+            + " \"title\" : \"$['title'][*]['value']\","
+            + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+            + " \"contributor\" : \"$['contributor'][*]['value']\","
+            + " \"description\" : \"$['description'][*]['value']\"}";

     private static SparkSession spark;

     private static Path workingDir;
-    private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.BulkTagJobTest.class);
+
+    private static final Logger log = LoggerFactory.getLogger(BulkTagJobTest.class);

     private static String taggingConf = "";

     static {
         try {
-            taggingConf = IO
+            taggingConf = IOUtils
                 .toString(
                     BulkTagJobTest.class
                         .getResourceAsStream(
-                            "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml"));
+                            "/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml"));
         } catch (IOException e) {
             e.printStackTrace();
         }
@@ -56,11 +63,11 @@ public class BulkTagJobTest {
     @BeforeAll
     public static void beforeAll() throws IOException {
-        workingDir = Files.createTempDirectory(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName());
+        workingDir = Files.createTempDirectory(BulkTagJobTest.class.getSimpleName());
         log.info("using work dir {}", workingDir);

         SparkConf conf = new SparkConf();
-        conf.setAppName(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName());
+        conf.setAppName(BulkTagJobTest.class.getSimpleName());

         conf.setMaster("local[*]");
         conf.set("spark.driver.host", "localhost");
@@ -84,34 +91,21 @@ public class BulkTagJobTest {
     @Test
     public void noUpdatesTest() throws Exception {
+        final String pathMap = BulkTagJobTest.pathMap;
         SparkBulkTagJob
             .main(
                 new String[] {
-                    "-isTest",
-                    Boolean.TRUE.toString(),
-                    "-isSparkSessionManaged",
-                    Boolean.FALSE.toString(),
-                    "-sourcePath",
-                    getClass().getResource("/eu/dnetlib/dhp/sample/dataset/no_updates").getPath(),
-                    "-taggingConf",
-                    taggingConf,
-                    "-resultTableName",
-                    "eu.dnetlib.dhp.schema.oaf.Dataset",
-                    "-outputPath",
-                    workingDir.toString() + "/dataset",
-                    "-isLookUpUrl",
-                    "http://beta.services.openaire.eu:8280/is/services/isLookUp",
-                    "-pathMap",
-                    "{ \"author\" : \"$['author'][*]['fullname']\","
-                        + " \"title\" : \"$['title'][*]['value']\","
-                        + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
-                        + " \"contributor\" : \"$['contributor'][*]['value']\","
-                        + " \"description\" : \"$['description'][*]['value']\"}"
-                    // "-preparedInfoPath",
-                    // getClass().getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo").getPath()
+                    "-isTest", Boolean.TRUE.toString(),
+                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates").getPath(),
+                    "-taggingConf", taggingConf,
+                    "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
+                    "-outputPath", workingDir.toString() + "/dataset",
+                    "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+                    "-pathMap", pathMap
                 });

-        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

         JavaRDD<Dataset> tmp = sc
             .textFile(workingDir.toString() + "/dataset")
@@ -134,34 +128,24 @@ public class BulkTagJobTest {
     @Test
     public void bulktagBySubjectNoPreviousContextTest() throws Exception {
+        final String sourcePath = getClass()
+            .getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext")
+            .getPath();
+        final String pathMap = BulkTagJobTest.pathMap;
         SparkBulkTagJob
             .main(
                 new String[] {
-                    "-isTest",
-                    Boolean.TRUE.toString(),
-                    "-isSparkSessionManaged",
-                    Boolean.FALSE.toString(),
-                    "-sourcePath",
-                    getClass()
-                        .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext")
-                        .getPath(),
-                    "-taggingConf",
-                    taggingConf,
-                    "-resultTableName",
-                    "eu.dnetlib.dhp.schema.oaf.Dataset",
-                    "-outputPath",
-                    workingDir.toString() + "/dataset",
-                    "-isLookUpUrl",
-                    "http://beta.services.openaire.eu:8280/is/services/isLookUp",
-                    "-pathMap",
-                    "{ \"author\" : \"$['author'][*]['fullname']\","
-                        + " \"title\" : \"$['title'][*]['value']\","
-                        + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
-                        + " \"contributor\" : \"$['contributor'][*]['value']\","
-                        + " \"description\" : \"$['description'][*]['value']\"}"
+                    "-isTest", Boolean.TRUE.toString(),
+                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-sourcePath", sourcePath,
+                    "-taggingConf", taggingConf,
+                    "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
+                    "-outputPath", workingDir.toString() + "/dataset",
+                    "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+                    "-pathMap", pathMap
                 });

-        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

         JavaRDD<Dataset> tmp = sc
             .textFile(workingDir.toString() + "/dataset")
@@ -240,32 +224,22 @@ public class BulkTagJobTest {
     @Test
     public void bulktagBySubjectPreviousContextNoProvenanceTest() throws Exception {
+        final String sourcePath = getClass()
+            .getResource(
+                "/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/contextnoprovenance")
+            .getPath();
+        final String pathMap = BulkTagJobTest.pathMap;
         SparkBulkTagJob
             .main(
                 new String[] {
-                    "-isTest",
-                    Boolean.TRUE.toString(),
-                    "-isSparkSessionManaged",
-                    Boolean.FALSE.toString(),
-                    "-sourcePath",
-                    getClass()
-                        .getResource(
-                            "/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance")
-                        .getPath(),
-                    "-taggingConf",
-                    taggingConf,
-                    "-resultTableName",
-                    "eu.dnetlib.dhp.schema.oaf.Dataset",
-                    "-outputPath",
-                    workingDir.toString() + "/dataset",
-                    "-isLookUpUrl",
-                    "http://beta.services.openaire.eu:8280/is/services/isLookUp",
-                    "-pathMap",
-                    "{ \"author\" : \"$['author'][*]['fullname']\","
-                        + " \"title\" : \"$['title'][*]['value']\","
-                        + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
-                        + " \"contributor\" : \"$['contributor'][*]['value']\","
-                        + " \"description\" : \"$['description'][*]['value']\"}"
+                    "-isTest", Boolean.TRUE.toString(),
+                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-sourcePath", sourcePath,
+                    "-taggingConf", taggingConf,
+                    "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
+                    "-outputPath", workingDir.toString() + "/dataset",
+                    "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+                    "-pathMap", pathMap
                 });

         final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@@ -332,34 +306,23 @@ public class BulkTagJobTest {
     @Test
     public void bulktagByDatasourceTest() throws Exception {
+        final String sourcePath = getClass()
+            .getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource")
+            .getPath();
         SparkBulkTagJob
             .main(
                 new String[] {
-                    "-isTest",
-                    Boolean.TRUE.toString(),
-                    "-isSparkSessionManaged",
-                    Boolean.FALSE.toString(),
-                    "-sourcePath",
-                    getClass()
-                        .getResource("/eu/dnetlib/dhp/sample/publication/update_datasource")
-                        .getPath(),
-                    "-taggingConf",
-                    taggingConf,
-                    "-resultTableName",
-                    "eu.dnetlib.dhp.schema.oaf.Publication",
-                    "-outputPath",
-                    workingDir.toString() + "/publication",
-                    "-isLookUpUrl",
-                    "http://beta.services.openaire.eu:8280/is/services/isLookUp",
-                    "-pathMap",
-                    "{ \"author\" : \"$['author'][*]['fullname']\","
-                        + " \"title\" : \"$['title'][*]['value']\","
-                        + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
-                        + " \"contributor\" : \"$['contributor'][*]['value']\","
-                        + " \"description\" : \"$['description'][*]['value']\"}"
+                    "-isTest", Boolean.TRUE.toString(),
+                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-sourcePath", sourcePath,
+                    "-taggingConf", taggingConf,
+                    "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
+                    "-outputPath", workingDir.toString() + "/publication",
+                    "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+                    "-pathMap", pathMap
                 });

-        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

         JavaRDD<Publication> tmp = sc
             .textFile(workingDir.toString() + "/publication")
@@ -415,35 +378,24 @@ public class BulkTagJobTest {
     @Test
     public void bulktagByZenodoCommunityTest() throws Exception {
+        final String sourcePath = getClass()
+            .getResource(
+                "/eu/dnetlib/dhp/bulktag/sample/otherresearchproduct/update_zenodocommunity")
+            .getPath();
         SparkBulkTagJob
             .main(
                 new String[] {
-                    "-isTest",
-                    Boolean.TRUE.toString(),
-                    "-isSparkSessionManaged",
-                    Boolean.FALSE.toString(),
-                    "-sourcePath",
-                    getClass()
-                        .getResource(
-                            "/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity")
-                        .getPath(),
-                    "-taggingConf",
-                    taggingConf,
-                    "-resultTableName",
-                    "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct",
-                    "-outputPath",
-                    workingDir.toString() + "/orp",
-                    "-isLookUpUrl",
-                    "http://beta.services.openaire.eu:8280/is/services/isLookUp",
-                    "-pathMap",
-                    "{ \"author\" : \"$['author'][*]['fullname']\","
-                        + " \"title\" : \"$['title'][*]['value']\","
-                        + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
-                        + " \"contributor\" : \"$['contributor'][*]['value']\","
-                        + " \"description\" : \"$['description'][*]['value']\"}"
+                    "-isTest", Boolean.TRUE.toString(),
+                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-sourcePath", sourcePath,
+                    "-taggingConf", taggingConf,
+                    "-resultTableName", "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct",
+                    "-outputPath", workingDir.toString() + "/orp",
+                    "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+                    "-pathMap", pathMap
                 });

-        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

         JavaRDD<OtherResearchProduct> tmp = sc
             .textFile(workingDir.toString() + "/orp")
@@ -548,34 +500,23 @@ public class BulkTagJobTest {
     @Test
     public void bulktagBySubjectDatasourceTest() throws Exception {
+        final String sourcePath = getClass()
+            .getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource")
+            .getPath();
         SparkBulkTagJob
             .main(
                 new String[] {
-                    "-isTest",
-                    Boolean.TRUE.toString(),
-                    "-isSparkSessionManaged",
-                    Boolean.FALSE.toString(),
-                    "-sourcePath",
-                    getClass()
-                        .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject_datasource")
-                        .getPath(),
-                    "-taggingConf",
-                    taggingConf,
-                    "-resultTableName",
-                    "eu.dnetlib.dhp.schema.oaf.Dataset",
-                    "-outputPath",
-                    workingDir.toString() + "/dataset",
-                    "-isLookUpUrl",
-                    "http://beta.services.openaire.eu:8280/is/services/isLookUp",
-                    "-pathMap",
-                    "{ \"author\" : \"$['author'][*]['fullname']\","
-                        + " \"title\" : \"$['title'][*]['value']\","
-                        + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
-                        + " \"contributor\" : \"$['contributor'][*]['value']\","
-                        + " \"description\" : \"$['description'][*]['value']\"}"
+                    "-isTest", Boolean.TRUE.toString(),
+                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-sourcePath", sourcePath,
+                    "-taggingConf", taggingConf,
+                    "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
+                    "-outputPath", workingDir.toString() + "/dataset",
+                    "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+                    "-pathMap", pathMap
                 });

-        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

         JavaRDD<Dataset> tmp = sc
             .textFile(workingDir.toString() + "/dataset")
@@ -691,29 +632,17 @@ public class BulkTagJobTest {
         SparkBulkTagJob
             .main(
                 new String[] {
-                    "-isTest",
-                    Boolean.TRUE.toString(),
-                    "-isSparkSessionManaged",
-                    Boolean.FALSE.toString(),
-                    "-sourcePath",
-                    getClass().getResource("/eu/dnetlib/dhp/sample/software/").getPath(),
-                    "-taggingConf",
-                    taggingConf,
-                    "-resultTableName",
-                    "eu.dnetlib.dhp.schema.oaf.Software",
-                    "-outputPath",
-                    workingDir.toString() + "/software",
-                    "-isLookUpUrl",
-                    "http://beta.services.openaire.eu:8280/is/services/isLookUp",
-                    "-pathMap",
-                    "{ \"author\" : \"$['author'][*]['fullname']\","
-                        + " \"title\" : \"$['title'][*]['value']\","
-                        + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
-                        + " \"contributor\" : \"$['contributor'][*]['value']\","
-                        + " \"description\" : \"$['description'][*]['value']\"}"
+                    "-isTest", Boolean.TRUE.toString(),
+                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/software/").getPath(),
+                    "-taggingConf", taggingConf,
+                    "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
+                    "-outputPath", workingDir.toString() + "/software",
+                    "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+                    "-pathMap", pathMap
                 });

-        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

         JavaRDD<Software> tmp = sc
             .textFile(workingDir.toString() + "/software")
@@ -796,35 +725,24 @@ public class BulkTagJobTest {
     @Test
     public void bulktagDatasourcewithConstraintsTest() throws Exception {
+        final String sourcePath = getClass()
+            .getResource(
+                "/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints")
+            .getPath();
         SparkBulkTagJob
             .main(
                 new String[] {
-                    "-isTest",
-                    Boolean.TRUE.toString(),
-                    "-isSparkSessionManaged",
-                    Boolean.FALSE.toString(),
-                    "-sourcePath",
-                    getClass()
-                        .getResource(
-                            "/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints")
-                        .getPath(),
-                    "-taggingConf",
-                    taggingConf,
-                    "-resultTableName",
-                    "eu.dnetlib.dhp.schema.oaf.Dataset",
-                    "-outputPath",
-                    workingDir.toString() + "/dataset",
-                    "-isLookUpUrl",
-                    "http://beta.services.openaire.eu:8280/is/services/isLookUp",
-                    "-pathMap",
-                    "{ \"author\" : \"$['author'][*]['fullname']\","
-                        + " \"title\" : \"$['title'][*]['value']\","
-                        + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
-                        + " \"contributor\" : \"$['contributor'][*]['value']\","
-                        + " \"description\" : \"$['description'][*]['value']\"}"
+                    "-isTest", Boolean.TRUE.toString(),
+                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
+                    "-sourcePath", sourcePath,
+                    "-taggingConf", taggingConf,
+                    "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
+                    "-outputPath", workingDir.toString() + "/dataset",
+                    "-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
+                    "-pathMap", pathMap
                 });

-        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

         JavaRDD<Dataset> tmp = sc
             .textFile(workingDir.toString() + "/dataset")
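
The pathMap constant hoisted to the top of BulkTagJobTest is a JSON map of Jayway JsonPath expressions (json-path is a declared dependency in the pom above), each evaluated against a serialized result record. A small standalone sketch with an invented sample record:

import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;

import java.util.List;

public class PathMapSketch {
    public static void main(String[] args) {
        // invented sample; the real input is a serialized OAF result
        String json = "{\"author\":[{\"fullname\":\"Doe, Jane\"},{\"fullname\":\"Roe, Richard\"}]}";
        DocumentContext ctx = JsonPath.parse(json);
        // same expression as the "author" entry of pathMap
        List<String> authors = ctx.read("$['author'][*]['fullname']");
        System.out.println(authors); // ["Doe, Jane", "Roe, Richard"]
    }
}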

View File

@@ -1,23 +1,21 @@
-package eu.dnetlib.dhp;
+package eu.dnetlib.dhp.bulktag;

-import java.io.IOException;
-import java.lang.reflect.InvocationTargetException;
-import java.util.*;
-
+import com.google.gson.Gson;
+import eu.dnetlib.dhp.bulktag.community.CommunityConfiguration;
+import eu.dnetlib.dhp.bulktag.community.CommunityConfigurationFactory;
+import eu.dnetlib.dhp.bulktag.community.Constraint;
+import eu.dnetlib.dhp.bulktag.community.SelectionConstraints;
+import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.dom4j.DocumentException;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;

-import com.google.gson.Gson;
-
-import eu.dnetlib.dhp.community.CommunityConfiguration;
-import eu.dnetlib.dhp.community.CommunityConfigurationFactory;
-import eu.dnetlib.dhp.community.Constraint;
-import eu.dnetlib.dhp.community.SelectionConstraints;
-import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.util.*;

 /** Created by miriam on 03/08/2018. */
 public class CommunityConfigurationFactoryTest {

View File

@@ -5,12 +5,15 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;

 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
@@ -26,12 +29,11 @@ import eu.dnetlib.dhp.schema.oaf.Software;
 import scala.Tuple2;

 public class CountryPropagationJobTest {

     private static final Logger log = LoggerFactory.getLogger(CountryPropagationJobTest.class);

     private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

-    private static final ClassLoader cl = CountryPropagationJobTest.class.getClassLoader();
-
     private static SparkSession spark;

     private static Path workingDir;
@@ -101,8 +103,8 @@ public class CountryPropagationJobTest {
         Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count());

         Dataset<String> countryExploded = verificationDs
-            .flatMap(row -> row.getCountry().iterator(), Encoders.bean(Country.class))
-            .map(c -> c.getClassid(), Encoders.STRING());
+            .flatMap((FlatMapFunction<Software, Country>) row -> row.getCountry().iterator(), Encoders.bean(Country.class))
+            .map((MapFunction<Country, String>) c -> c.getClassid(), Encoders.STRING());

         Assertions.assertEquals(9, countryExploded.count());
@@ -115,20 +117,18 @@ public class CountryPropagationJobTest {
         Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count());

         Dataset<Tuple2<String, String>> countryExplodedWithCountryclassid = verificationDs
-            .flatMap(
-                row -> {
-                    List<Tuple2<String, String>> prova = new ArrayList();
-                    List<Country> country_list = row.getCountry();
-                    country_list
-                        .stream()
-                        .forEach(
-                            c -> prova
-                                .add(
-                                    new Tuple2<>(
-                                        row.getId(), c.getClassid())));
-                    return prova.iterator();
-                },
-                Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
+            .flatMap((FlatMapFunction<Software, Tuple2<String, String>>) row -> {
+                List<Tuple2<String, String>> prova = new ArrayList();
+                List<Country> country_list = row.getCountry();
+                country_list
+                    .stream()
+                    .forEach(
+                        c -> prova
+                            .add(
+                                new Tuple2<>(
+                                    row.getId(), c.getClassid())));
+                return prova.iterator();
+            }, Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

         Assertions.assertEquals(9, countryExplodedWithCountryclassid.count());
@@ -178,20 +178,20 @@ public class CountryPropagationJobTest {
         Dataset<Tuple2<String, String>> countryExplodedWithCountryclassname = verificationDs
             .flatMap(
-                row -> {
+                (FlatMapFunction<Software, Tuple2<String, String>>) row -> {
                     List<Tuple2<String, String>> prova = new ArrayList();
                     List<Country> country_list = row.getCountry();
                     country_list
                         .stream()
                         .forEach(
                             c -> prova
                                 .add(
                                     new Tuple2<>(
                                         row.getId(),
                                         c.getClassname())));
                     return prova.iterator();
                 },
                 Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

         countryExplodedWithCountryclassname.show(false);
         Assertions
@@ -239,22 +239,22 @@ public class CountryPropagationJobTest {
         Dataset<Tuple2<String, String>> countryExplodedWithCountryProvenance = verificationDs
             .flatMap(
-                row -> {
+                (FlatMapFunction<Software, Tuple2<String, String>>) row -> {
                     List<Tuple2<String, String>> prova = new ArrayList();
                     List<Country> country_list = row.getCountry();
                     country_list
                         .stream()
                         .forEach(
                             c -> prova
                                 .add(
                                     new Tuple2<>(
                                         row.getId(),
                                         c
                                             .getDataInfo()
                                             .getInferenceprovenance())));
                     return prova.iterator();
                 },
                 Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

         Assertions
             .assertEquals(
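
The casts added throughout this test are needed because Dataset.flatMap and Dataset.map are overloaded for both Scala functions and Java functional interfaces, so a bare Java lambda is ambiguous at compile time; the explicit (FlatMapFunction<...>) / (MapFunction<...>) cast selects the Java overload. A self-contained sketch with simplified types, not the test's actual beans:

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import java.util.Arrays;

public class FlatMapCastSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate();
        Dataset<String> lines = spark.createDataset(Arrays.asList("a b", "c"), Encoders.STRING());
        // without the cast this call is ambiguous between the Scala and Java overloads
        Dataset<String> words = lines.flatMap(
            (FlatMapFunction<String, String>) s -> Arrays.asList(s.split(" ")).iterator(),
            Encoders.STRING());
        words.show();
        spark.stop();
    }
}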

View File

@@ -29,8 +29,6 @@ public class OrcidPropagationJobTest {

     private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

-    private static final ClassLoader cl = OrcidPropagationJobTest.class.getClassLoader();
-
     private static SparkSession spark;

     private static Path workingDir;

View File

@ -9,6 +9,7 @@ import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
@ -29,8 +30,6 @@ public class ProjectPropagationJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = ProjectPropagationJobTest.class.getClassLoader();
private static SparkSession spark; private static SparkSession spark;
private static Path workingDir; private static Path workingDir;
@ -72,34 +71,26 @@ public class ProjectPropagationJobTest {
@Test @Test
public void NoUpdateTest() throws Exception { public void NoUpdateTest() throws Exception {
SparkResultToProjectThroughSemRelJob final String potentialUpdateDate = getClass()
.main( .getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates")
.getPath();
final String alreadyLinkedPath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked")
.getPath();
SparkResultToProjectThroughSemRelJob.main(
new String[] { new String[] {
"-isTest", "-isTest", Boolean.TRUE.toString(),
Boolean.TRUE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-isSparkSessionManaged", "-hive_metastore_uris", "",
Boolean.FALSE.toString(), "-saveGraph", "true",
// "-sourcePath", "-outputPath", workingDir.toString() + "/relation",
// getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), "-potentialUpdatePath", potentialUpdateDate,
"-hive_metastore_uris", "-alreadyLinkedPath", alreadyLinkedPath,
"",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-potentialUpdatePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked")
.getPath(),
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation") .textFile(workingDir.toString() + "/relation")
@ -115,34 +106,26 @@ public class ProjectPropagationJobTest {
*/ */
@Test @Test
public void UpdateTenTest() throws Exception { public void UpdateTenTest() throws Exception {
SparkResultToProjectThroughSemRelJob final String potentialUpdatePath = getClass()
.main( .getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates")
.getPath();
final String alreadyLinkedPath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked")
.getPath();
SparkResultToProjectThroughSemRelJob.main(
new String[] { new String[] {
"-isTest", "-isTest", Boolean.TRUE.toString(),
Boolean.TRUE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-isSparkSessionManaged", "-hive_metastore_uris", "",
Boolean.FALSE.toString(), "-saveGraph", "true",
// "-sourcePath", "-outputPath", workingDir.toString() + "/relation",
// getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), "-potentialUpdatePath", potentialUpdatePath,
"-hive_metastore_uris", "-alreadyLinkedPath", alreadyLinkedPath,
"",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-potentialUpdatePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked")
.getPath(),
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation") .textFile(workingDir.toString() + "/relation")
@ -160,18 +143,18 @@ public class ProjectPropagationJobTest {
.assertEquals( .assertEquals(
5, 5,
verificationDs verificationDs
.filter( .filter((FilterFunction<Relation>) r ->
r -> r.getSource().substring(0, 2).equals("50") r.getSource().startsWith("50")
&& r.getTarget().substring(0, 2).equals("40") && r.getTarget().startsWith("40")
&& r.getRelClass().equals("isProducedBy")) && r.getRelClass().equals("isProducedBy"))
.count()); .count());
Assertions Assertions
.assertEquals( .assertEquals(
5, 5,
verificationDs verificationDs
.filter( .filter((FilterFunction<Relation>) r ->
r -> r.getSource().substring(0, 2).equals("40") r.getSource().startsWith("40")
&& r.getTarget().substring(0, 2).equals("50") && r.getTarget().startsWith("50")
&& r.getRelClass().equals("produces")) && r.getRelClass().equals("produces"))
.count()); .count());
@ -194,34 +177,26 @@ public class ProjectPropagationJobTest {
*/ */
@Test @Test
public void UpdateMixTest() throws Exception { public void UpdateMixTest() throws Exception {
SparkResultToProjectThroughSemRelJob final String potentialUpdatepath = getClass()
.main( .getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates")
.getPath();
final String alreadyLinkedPath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked")
.getPath();
SparkResultToProjectThroughSemRelJob.main(
new String[] { new String[] {
"-isTest", "-isTest", Boolean.TRUE.toString(),
Boolean.TRUE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-isSparkSessionManaged", "-hive_metastore_uris", "",
Boolean.FALSE.toString(), "-saveGraph", "true",
// "-sourcePath", "-outputPath", workingDir.toString() + "/relation",
// getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), "-potentialUpdatePath", potentialUpdatePath,
"-hive_metastore_uris", "-alreadyLinkedPath", alreadyLinkedPath,
"",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-potentialUpdatePath",
getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked")
.getPath(),
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation") .textFile(workingDir.toString() + "/relation")
@ -242,18 +217,18 @@ public class ProjectPropagationJobTest {
.assertEquals( .assertEquals(
4, 4,
verificationDs verificationDs
.filter( .filter((FilterFunction<Relation>) r ->
r -> r.getSource().substring(0, 2).equals("50") r.getSource().startsWith("50")
&& r.getTarget().substring(0, 2).equals("40") && r.getTarget().startsWith("40")
&& r.getRelClass().equals("isProducedBy")) && r.getRelClass().equals("isProducedBy"))
.count()); .count());
Assertions Assertions
.assertEquals( .assertEquals(
4, 4,
verificationDs verificationDs
.filter( .filter((FilterFunction<Relation>) r ->
r -> r.getSource().substring(0, 2).equals("40") r.getSource().startsWith("40")
&& r.getTarget().substring(0, 2).equals("50") && r.getTarget().startsWith("50")
&& r.getRelClass().equals("produces")) && r.getRelClass().equals("produces"))
.count()); .count());
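Side note on the filter rewrite above: Spark's Java API needs the explicit (FilterFunction<Relation>) cast to select the right Dataset.filter overload when a lambda is passed, and startsWith("50") states the id-prefix check more directly than substring(0, 2).equals("50"). A minimal, self-contained sketch of the pattern, assuming the Relation bean from eu.dnetlib.dhp.schema.oaf with the accessors used in the tests above:

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;

import eu.dnetlib.dhp.schema.oaf.Relation;

public class RelationFilterSketch {

    // Counts result->project links: result ids start with "50", project ids with "40".
    // The cast disambiguates filter(FilterFunction<T>) from the Column/String overloads.
    public static long countProducedBy(Dataset<Relation> relations) {
        return relations
            .filter((FilterFunction<Relation>) r ->
                r.getSource().startsWith("50")
                    && r.getTarget().startsWith("40")
                    && "isProducedBy".equals(r.getRelClass()))
            .count();
    }
}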

View File

@ -32,8 +32,6 @@ public class ResultToCommunityJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = ResultToCommunityJobTest.class.getClassLoader();
private static SparkSession spark; private static SparkSession spark;
private static Path workingDir; private static Path workingDir;
@ -68,33 +66,24 @@ public class ResultToCommunityJobTest {
@Test @Test
public void testSparkResultToCommunityFromOrganizationJob() throws Exception { public void testSparkResultToCommunityFromOrganizationJob() throws Exception {
SparkResultToCommunityFromOrganizationJob final String preparedInfoPath = getClass()
.main( .getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo")
.getPath();
SparkResultToCommunityFromOrganizationJob.main(
new String[] { new String[] {
"-isTest", "-isTest", Boolean.TRUE.toString(),
Boolean.TRUE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-isSparkSessionManaged", "-sourcePath", getClass()
Boolean.FALSE.toString(),
"-sourcePath",
getClass()
.getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/sample") .getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/sample")
.getPath(), .getPath(),
"-hive_metastore_uris", "-hive_metastore_uris", "",
"", "-saveGraph", "true",
"-saveGraph", "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"true", "-outputPath", workingDir.toString() + "/dataset",
"-resultTableName", "-preparedInfoPath", preparedInfoPath
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",
workingDir.toString() + "/dataset",
"-preparedInfoPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo")
.getPath()
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset") .textFile(workingDir.toString() + "/dataset")
@ -217,13 +206,6 @@ public class ResultToCommunityJobTest {
.get(0) .get(0)
.getString(0)); .getString(0));
/*
* {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::8d817039a63710fcf97e30f14662c6c8"}
* "context" ["id": euromarine] updates = 1
* {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6"} context
* = [ni, euromarine] updates = 1
*/
query = "select id, MyT.id community " query = "select id, MyT.id community "
+ "from dataset " + "from dataset "
+ "lateral view explode(context) c as MyT " + "lateral view explode(context) c as MyT "

View File

@ -29,32 +29,21 @@ import eu.dnetlib.dhp.schema.oaf.Dataset;
public class ResultToCommunityJobTest { public class ResultToCommunityJobTest {
private static final Logger log = LoggerFactory private static final Logger log = LoggerFactory.getLogger(ResultToCommunityJobTest.class);
.getLogger(
eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class
.getClassLoader();
private static SparkSession spark; private static SparkSession spark;
private static Path workingDir; private static Path workingDir;
@BeforeAll @BeforeAll
public static void beforeAll() throws IOException { public static void beforeAll() throws IOException {
workingDir = Files workingDir = Files.createTempDirectory(ResultToCommunityJobTest.class.getSimpleName());
.createTempDirectory(
eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class
.getSimpleName());
log.info("using work dir {}", workingDir); log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
conf conf.setAppName(ResultToCommunityJobTest.class.getSimpleName());
.setAppName(
eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class
.getSimpleName());
conf.setMaster("local[*]"); conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost"); conf.set("spark.driver.host", "localhost");
@ -65,7 +54,7 @@ public class ResultToCommunityJobTest {
spark = SparkSession spark = SparkSession
.builder() .builder()
.appName(OrcidPropagationJobTest.class.getSimpleName()) .appName(ResultToCommunityJobTest.class.getSimpleName())
.config(conf) .config(conf)
.getOrCreate(); .getOrCreate();
} }
@ -83,22 +72,18 @@ public class ResultToCommunityJobTest {
new String[] { new String[] {
"-isTest", Boolean.TRUE.toString(), "-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", "-sourcePath", getClass()
getClass() .getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample")
.getResource(
"/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample")
.getPath(), .getPath(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset", "-outputPath", workingDir.toString() + "/dataset",
"-preparedInfoPath", "-preparedInfoPath", getClass()
getClass() .getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo")
.getResource(
"/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo")
.getPath() .getPath()
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset") .textFile(workingDir.toString() + "/dataset")
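All three test classes in this diff share the same read-back idiom: obtain a JavaSparkContext from the shared session with the static factory JavaSparkContext.fromSparkContext (rather than wrapping it in new JavaSparkContext(...), as the old code did), read the job output as text, and deserialize each JSON line. The diff cuts off right after .textFile(...); the map step below is a sketch of the conventional Jackson-based completion, not a quote of the hidden lines:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.oaf.Dataset;

public class ReadBackSketch {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    // Reads the serialized job output back into typed records for the assertions.
    public static JavaRDD<Dataset> readOutput(SparkSession spark, String path) {
        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
        return sc
            .textFile(path)
            .map(line -> OBJECT_MAPPER.readValue(line, Dataset.class));
    }
}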

View File

@ -23,23 +23,19 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
public class Result2OrganizationJobTest { public class ResultToOrganizationJobTest {
private static final Logger log = LoggerFactory.getLogger(Result2OrganizationJobTest.class); private static final Logger log = LoggerFactory.getLogger(ResultToOrganizationJobTest.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = Result2OrganizationJobTest.class.getClassLoader();
private static SparkSession spark; private static SparkSession spark;
private static Path workingDir; private static Path workingDir;
@BeforeAll @BeforeAll
public static void beforeAll() throws IOException { public static void beforeAll() throws IOException {
workingDir = Files workingDir = Files.createTempDirectory(SparkResultToOrganizationFromIstRepoJob.class.getSimpleName());
.createTempDirectory(
SparkResultToOrganizationFromIstRepoJob.class.getSimpleName());
log.info("using work dir {}", workingDir); log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
@ -72,40 +68,29 @@ public class Result2OrganizationJobTest {
*/ */
@Test @Test
public void NoUpdateTest() throws Exception { public void NoUpdateTest() throws Exception {
SparkResultToOrganizationFromIstRepoJob final String sourcePath = getClass()
.main( .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix")
.getPath();
final String datasourceOrganizationPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization")
.getPath();
final String alreadyLinkedPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked")
.getPath();
SparkResultToOrganizationFromIstRepoJob.main(
new String[] { new String[] {
"-isTest", "-isTest", Boolean.TRUE.toString(),
Boolean.TRUE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-isSparkSessionManaged", "-sourcePath", sourcePath,
Boolean.FALSE.toString(), "-hive_metastore_uris", "",
"-sourcePath", "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
getClass() "-saveGraph", "true",
.getResource( "-outputPath", workingDir.toString() + "/relation",
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") "-datasourceOrganizationPath", datasourceOrganizationPath,
.getPath(), "-alreadyLinkedPath", alreadyLinkedPath,
"-hive_metastore_uris",
"",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Software",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-datasourceOrganizationPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked")
.getPath(),
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation") .textFile(workingDir.toString() + "/relation")
@ -123,40 +108,29 @@ public class Result2OrganizationJobTest {
*/ */
@Test @Test
public void UpdateNoMixTest() throws Exception { public void UpdateNoMixTest() throws Exception {
SparkResultToOrganizationFromIstRepoJob final String sourcePath = getClass()
.main( .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix")
.getPath();
final String datasourceOrganizationPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization")
.getPath();
final String alreadyLinkedPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked")
.getPath();
SparkResultToOrganizationFromIstRepoJob.main(
new String[] { new String[] {
"-isTest", "-isTest", Boolean.TRUE.toString(),
Boolean.TRUE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-isSparkSessionManaged", "-sourcePath", sourcePath,
Boolean.FALSE.toString(), "-hive_metastore_uris", "",
"-sourcePath", "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
getClass() "-saveGraph", "true",
.getResource( "-outputPath", workingDir.toString() + "/relation",
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") "-datasourceOrganizationPath", datasourceOrganizationPath,
.getPath(), "-alreadyLinkedPath", alreadyLinkedPath,
"-hive_metastore_uris",
"",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Software",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-datasourceOrganizationPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked")
.getPath(),
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation") .textFile(workingDir.toString() + "/relation")
@ -197,40 +171,29 @@ public class Result2OrganizationJobTest {
@Test @Test
public void UpdateMixTest() throws Exception { public void UpdateMixTest() throws Exception {
SparkResultToOrganizationFromIstRepoJob final String sourcePath = getClass()
.main( .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix")
.getPath();
final String datasourceOrganizationPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization")
.getPath();
final String alreadyLinkedPath = getClass()
.getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked")
.getPath();
SparkResultToOrganizationFromIstRepoJob.main(
new String[] { new String[] {
"-isTest", "-isTest", Boolean.TRUE.toString(),
Boolean.TRUE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-isSparkSessionManaged", "-sourcePath", sourcePath,
Boolean.FALSE.toString(), "-hive_metastore_uris", "",
"-sourcePath", "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software",
getClass() "-saveGraph", "true",
.getResource( "-outputPath", workingDir.toString() + "/relation",
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix") "-datasourceOrganizationPath", datasourceOrganizationPath,
.getPath(), "-alreadyLinkedPath", alreadyLinkedPath,
"-hive_metastore_uris",
"",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Software",
"-saveGraph",
"true",
"-outputPath",
workingDir.toString() + "/relation",
"-datasourceOrganizationPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization")
.getPath(),
"-alreadyLinkedPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked")
.getPath(),
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.textFile(workingDir.toString() + "/relation") .textFile(workingDir.toString() + "/relation")
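The same hoisting pattern repeats in every test of this class: each getClass().getResource(...).getPath() chain becomes a named local before the main(String[]) call. If the repetition grows further, a tiny helper could absorb it; the resourcePath method below is purely hypothetical — it is not part of this commit — and adds a fail-fast null check that getResource alone does not give you:

import java.util.Objects;

public class TestResourceSupport {

    // Hypothetical helper (not in this commit): resolves a classpath resource to a
    // filesystem path, failing fast with a readable message if it is missing.
    public static String resourcePath(Class<?> anchor, String name) {
        return Objects
            .requireNonNull(anchor.getResource(name), "missing test resource: " + name)
            .getPath();
    }
}

With it, a declaration like the one above would read: final String sourcePath = resourcePath(getClass(), "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix");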

View File

@ -2,17 +2,17 @@
<community id="fet-fp7"> <community id="fet-fp7">
<oacommunity/> <oacommunity/>
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="fet-h2020"> <community id="fet-h2020">
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="oa-pg"> <community id="oa-pg">
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="ee"> <community id="ee">
@ -35,7 +35,7 @@
<subject>SDG9 - Industry innovation and infrastructure</subject> <subject>SDG9 - Industry innovation and infrastructure</subject>
<subject>SDG16 - Peace justice and strong institutions</subject> <subject>SDG16 - Peace justice and strong institutions</subject>
</subjects> </subjects>
<datasources/> <providers/>
<zenodocommunities> <zenodocommunities>
<zenodocommunity> <zenodocommunity>
<zenodoid>123</zenodoid> <zenodoid>123</zenodoid>
@ -45,12 +45,12 @@
</community> </community>
<community id="dh-ch"> <community id="dh-ch">
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="fam"> <community id="fam">
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="ni"> <community id="ni">
@ -74,7 +74,7 @@
<subject>brain magnetic resonance imaging</subject> <subject>brain magnetic resonance imaging</subject>
<subject>brain abnormalities</subject> <subject>brain abnormalities</subject>
</subjects> </subjects>
<datasources> <providers>
<datasource> <datasource>
<openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId> <openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId>
<selcriteria/> <selcriteria/>
@ -95,7 +95,7 @@
<openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId> <openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId>
<selcriteria/> <selcriteria/>
</datasource> </datasource>
</datasources> </providers>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="mes"> <community id="mes">
@ -106,12 +106,12 @@
<subject>aqua</subject> <subject>aqua</subject>
<subject>sea</subject> <subject>sea</subject>
</subjects> </subjects>
<datasources> <providers>
<datasource> <datasource>
<openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId> <openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId>
<selcriteria/> <selcriteria/>
</datasource> </datasource>
</datasources> </providers>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="aginfra"> <community id="aginfra">
@ -134,7 +134,7 @@
<subject>food distribution</subject> <subject>food distribution</subject>
<subject>forestry</subject> <subject>forestry</subject>
</subjects> </subjects>
<datasources> <providers>
<datasource> <datasource>
<openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId> <openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId>
<selcriteria/> <selcriteria/>
@ -159,18 +159,18 @@
<openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId> <openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId>
<selcriteria/> <selcriteria/>
</datasource> </datasource>
</datasources> </providers>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="clarin"> <community id="clarin">
<oacommunity>oac_clarin</oacommunity> <oacommunity>oac_clarin</oacommunity>
<subjects/> <subjects/>
<datasources> <providers>
<datasource> <datasource>
<openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId> <openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId>
<selcriteria/> <selcriteria/>
</datasource> </datasource>
</datasources> </providers>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
</communities> </communities>

View File

@ -2,17 +2,17 @@
<community id="fet-fp7"> <community id="fet-fp7">
<oacommunity/> <oacommunity/>
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="fet-h2020"> <community id="fet-h2020">
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="oa-pg"> <community id="oa-pg">
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="ee"> <community id="ee">
@ -35,7 +35,7 @@
<subject>SDG9 - Industry innovation and infrastructure</subject> <subject>SDG9 - Industry innovation and infrastructure</subject>
<subject>SDG16 - Peace justice and strong institutions</subject> <subject>SDG16 - Peace justice and strong institutions</subject>
</subjects> </subjects>
<datasources/> <providers/>
<zenodocommunities> <zenodocommunities>
<zenodocommunity> <zenodocommunity>
<zenodoid>123</zenodoid> <zenodoid>123</zenodoid>
@ -45,12 +45,12 @@
</community> </community>
<community id="dh-ch"> <community id="dh-ch">
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="fam"> <community id="fam">
<subjects/> <subjects/>
<datasources/> <providers/>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="ni"> <community id="ni">
@ -74,7 +74,7 @@
<subject>brain magnetic resonance imaging</subject> <subject>brain magnetic resonance imaging</subject>
<subject>brain abnormalities</subject> <subject>brain abnormalities</subject>
</subjects> </subjects>
<datasources> <providers>
<datasource> <datasource>
<openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId> <openaireId>re3data_____::5b9bf9171d92df854cf3c520692e9122</openaireId>
<selcriteria/> <selcriteria/>
@ -95,7 +95,7 @@
<openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId> <openaireId>doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a</openaireId>
<selcriteria/> <selcriteria/>
</datasource> </datasource>
</datasources> </providers>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="mes"> <community id="mes">
@ -106,12 +106,12 @@
<subject>aqua</subject> <subject>aqua</subject>
<subject>sea</subject> <subject>sea</subject>
</subjects> </subjects>
<datasources> <providers>
<datasource> <datasource>
<openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId> <openaireId>re3data_____::9633d1e8c4309c833c2c442abeb0cfeb</openaireId>
<selcriteria/> <selcriteria/>
</datasource> </datasource>
</datasources> </providers>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="aginfra"> <community id="aginfra">
@ -134,7 +134,7 @@
<subject>food distribution</subject> <subject>food distribution</subject>
<subject>forestry</subject> <subject>forestry</subject>
</subjects> </subjects>
<datasources> <providers>
<datasource> <datasource>
<openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId> <openaireId>opendoar____::1a551829d50f1400b0dab21fdd969c04</openaireId>
<selcriteria/> <selcriteria/>
@ -159,30 +159,30 @@
<openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId> <openaireId>opendoar____::87ae6fb631f7c8a627e8e28785d9992d</openaireId>
<selcriteria/> <selcriteria/>
</datasource> </datasource>
</datasources> </providers>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="clarin"> <community id="clarin">
<oacommunity>oac_clarin</oacommunity> <oacommunity>oac_clarin</oacommunity>
<subjects/> <subjects/>
<datasources> <providers>
<datasource> <datasource>
<openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId> <openaireId>re3data_____::a507cdacc5bbcc08761c92185dee5cab</openaireId>
<selcriteria/> <selcriteria/>
</datasource> </datasource>
</datasources> </providers>
<zenodocommunities/> <zenodocommunities/>
</community> </community>
<community id="dariah"> <community id="dariah">
<oacommunity>oaa_dariah</oacommunity> <oacommunity>oaa_dariah</oacommunity>
<subjects/> <subjects/>
<datasources> <providers>
<datasource> <datasource>
<openaireId>openaire____::1cfdb2e14977f31a98e0118283401f32</openaireId> <openaireId>openaire____::1cfdb2e14977f31a98e0118283401f32</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]} <selcriteria>{"criteria":[{"constraint":[{"verb":"contains","field":"contributor","value":"DARIAH"}]}]}
</selcriteria> </selcriteria>
</datasource> </datasource>
</datasources> </providers>
<zenodocommunities> <zenodocommunities>
<zenodocommunity> <zenodocommunity>
<zenodoid>dimpo</zenodoid> <zenodoid>dimpo</zenodoid>
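One detail worth noting in the renamed <providers> sections: the <selcriteria> payload (see the dariah entry above) is plain JSON embedded in the XML. A hedged sketch of walking it with Jackson — the verb/field/value triple comes verbatim from the XML above, while the traversal code is illustrative and not the module's actual selection-criteria parser:

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class SelCriteriaSketch {

    public static void main(String[] args) throws Exception {
        String selcriteria =
            "{\"criteria\":[{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}]}";

        // Drill down to the first constraint of the first criterion.
        JsonNode constraint = new ObjectMapper()
            .readTree(selcriteria)
            .path("criteria").get(0)
            .path("constraint").get(0);

        // Prints: contains contributor DARIAH
        System.out.printf("%s %s %s%n",
            constraint.path("verb").asText(),
            constraint.path("field").asText(),
            constraint.path("value").asText());
    }
}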

Some files were not shown because too many files have changed in this diff.