adding context information to projects and datasources #407

Merged
claudio.atzori merged 8 commits from taggingProjects into beta 2024-03-26 14:53:39 +01:00
13 changed files with 388 additions and 308 deletions
Showing only changes of commit 8dae10b442 - Show all commits

View File

@ -30,7 +30,7 @@ public class MoveResult implements Serializable {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils String jsonConfiguration = IOUtils
.toString( .toString(
MoveResult.class MoveResult.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/wf/subworkflows/input_moveresult_parameters.json")); "/eu/dnetlib/dhp/wf/subworkflows/input_moveresult_parameters.json"));

View File

@ -78,5 +78,4 @@ public class QueryCommunityAPI {
return body; return body;
} }
} }

View File

@ -6,10 +6,6 @@ import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -24,6 +20,10 @@ import eu.dnetlib.dhp.bulktag.community.CommunityConfiguration;
import eu.dnetlib.dhp.bulktag.community.Provider; import eu.dnetlib.dhp.bulktag.community.Provider;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory; import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
@ -57,7 +57,7 @@ public class Utils implements Serializable {
if (d.getEnabled() == null || Boolean.FALSE.equals(d.getEnabled())) if (d.getEnabled() == null || Boolean.FALSE.equals(d.getEnabled()))
return null; return null;
Provider p = new Provider(); Provider p = new Provider();
p.setOpenaireId(ModelSupport.getIdPrefix(Datasource.class)+"|" + d.getOpenaireId()); p.setOpenaireId(ModelSupport.getIdPrefix(Datasource.class) + "|" + d.getOpenaireId());
p.setSelectionConstraints(d.getSelectioncriteria()); p.setSelectionConstraints(d.getSelectioncriteria());
if (p.getSelectionConstraints() != null) if (p.getSelectionConstraints() != null)
p.getSelectionConstraints().setSelection(resolver); p.getSelectionConstraints().setSelection(resolver);
@ -175,27 +175,31 @@ public class Utils implements Serializable {
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public static List<EntityCommunities> getDatasourceCommunities(String baseURL)throws IOException{ public static List<EntityCommunities> getDatasourceCommunities(String baseURL) throws IOException {
List<CommunityModel> validCommunities = getValidCommunities(baseURL); List<CommunityModel> validCommunities = getValidCommunities(baseURL);
HashMap<String, Set<String>> map = new HashMap<>(); HashMap<String, Set<String>> map = new HashMap<>();
String entityPrefix = ModelSupport.getIdPrefix(Datasource.class) + "|" ; String entityPrefix = ModelSupport.getIdPrefix(Datasource.class) + "|";
validCommunities.forEach(c -> { validCommunities.forEach(c -> {
try { try {
new ObjectMapper().readValue(QueryCommunityAPI.communityDatasource(c.getId(), baseURL), DatasourceList.class) new ObjectMapper()
.forEach(d -> { .readValue(QueryCommunityAPI.communityDatasource(c.getId(), baseURL), DatasourceList.class)
if (!map.keySet().contains(d.getOpenaireId())) .forEach(d -> {
map.put(d.getOpenaireId(), new HashSet<>()); if (!map.keySet().contains(d.getOpenaireId()))
map.put(d.getOpenaireId(), new HashSet<>());
map.get(d.getOpenaireId()).add(c.getId()); map.get(d.getOpenaireId()).add(c.getId());
}); });
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
}); });
List<EntityCommunities> temp = map.keySet().stream() List<EntityCommunities> temp = map
.map(k -> EntityCommunities.newInstance(entityPrefix + k, getCollect(k, map))).collect(Collectors.toList()); .keySet()
.stream()
.map(k -> EntityCommunities.newInstance(entityPrefix + k, getCollect(k, map)))
.collect(Collectors.toList());
return temp; return temp;
@ -207,5 +211,4 @@ public class Utils implements Serializable {
return temp; return temp;
} }
} }

View File

@ -1,39 +1,40 @@
package eu.dnetlib.dhp.api.model;
import eu.dnetlib.dhp.schema.common.ModelSupport; package eu.dnetlib.dhp.api.model;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Datasource;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
* @Date 13/02/24 * @Date 13/02/24
*/ */
public class EntityCommunities implements Serializable { public class EntityCommunities implements Serializable {
private String entityId; private String entityId;
private List<String> communitiesId; private List<String> communitiesId;
public String getEntityId() { public String getEntityId() {
return entityId; return entityId;
} }
public void setEntityId(String entityId) { public void setEntityId(String entityId) {
this.entityId = entityId; this.entityId = entityId;
} }
public List<String> getCommunitiesId() { public List<String> getCommunitiesId() {
return communitiesId; return communitiesId;
} }
public void setCommunitiesId(List<String> communitiesId) { public void setCommunitiesId(List<String> communitiesId) {
this.communitiesId = communitiesId; this.communitiesId = communitiesId;
} }
public static EntityCommunities newInstance(String dsid, List<String> csid){ public static EntityCommunities newInstance(String dsid, List<String> csid) {
EntityCommunities dsc = new EntityCommunities(); EntityCommunities dsc = new EntityCommunities();
dsc.entityId = dsid; dsc.entityId = dsid;
dsc.communitiesId = csid; dsc.communitiesId = csid;
return dsc; return dsc;
} }
} }

View File

@ -7,11 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.api.model.CommunityEntityMap;
import eu.dnetlib.dhp.api.model.EntityCommunities;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Project;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FilterFunction;
@ -27,10 +22,15 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson; import com.google.gson.Gson;
import eu.dnetlib.dhp.api.Utils; import eu.dnetlib.dhp.api.Utils;
import eu.dnetlib.dhp.api.model.CommunityEntityMap;
import eu.dnetlib.dhp.api.model.EntityCommunities;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bulktag.community.*; import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2; import scala.Tuple2;
@ -53,6 +53,7 @@ public class SparkBulkTagJob {
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/wf/subworkflows/bulktag/input_bulkTag_parameters.json")); "/eu/dnetlib/dhp/wf/subworkflows/bulktag/input_bulkTag_parameters.json"));
log.info(args.toString());
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);
@ -71,7 +72,8 @@ public class SparkBulkTagJob {
final String baseURL = parser.get("baseURL"); final String baseURL = parser.get("baseURL");
log.info("baseURL: {}", baseURL); log.info("baseURL: {}", baseURL);
ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap"), ProtoMap.class); log.info("pathMap: {}", parser.get("pathMap"));
ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap") + "}}", ProtoMap.class);
log.info("pathMap: {}", new Gson().toJson(protoMappingParams)); log.info("pathMap: {}", new Gson().toJson(protoMappingParams));
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
@ -100,89 +102,123 @@ public class SparkBulkTagJob {
}); });
} }
private static void execProjectTag(SparkSession spark, String inputPath, String outputPath, CommunityEntityMap communityProjects) { private static void execProjectTag(SparkSession spark, String inputPath, String outputPath,
CommunityEntityMap communityProjects) {
Dataset<Project> projects = readPath(spark, inputPath + "project", Project.class); Dataset<Project> projects = readPath(spark, inputPath + "project", Project.class);
Dataset<EntityCommunities> pc = spark.createDataset(communityProjects.keySet().stream().map(k -> EntityCommunities.newInstance(k, communityProjects.get(k))).collect(Collectors.toList()), Encoders.bean(EntityCommunities.class)); Dataset<EntityCommunities> pc = spark
.createDataset(
communityProjects
.keySet()
.stream()
.map(k -> EntityCommunities.newInstance(k, communityProjects.get(k)))
.collect(Collectors.toList()),
Encoders.bean(EntityCommunities.class));
projects.joinWith(pc, projects.col("id").equalTo(pc.col("entityId")), "left") projects
.map((MapFunction<Tuple2<Project, EntityCommunities>, Project>) t2 -> { .joinWith(pc, projects.col("id").equalTo(pc.col("entityId")), "left")
Project ds = t2._1(); .map((MapFunction<Tuple2<Project, EntityCommunities>, Project>) t2 -> {
if (t2._2() != null){ Project ds = t2._1();
List<String> context = if (t2._2() != null) {
Optional.ofNullable(ds.getContext()) List<String> context = Optional
.map(v -> v.stream().map(c -> c.getId()).collect(Collectors.toList())) .ofNullable(ds.getContext())
.orElse(new ArrayList<>()); .map(v -> v.stream().map(c -> c.getId()).collect(Collectors.toList()))
.orElse(new ArrayList<>());
if(!Optional.ofNullable(ds.getContext()).isPresent()) if (!Optional.ofNullable(ds.getContext()).isPresent())
ds.setContext(new ArrayList<>()); ds.setContext(new ArrayList<>());
t2._2().getCommunitiesId().forEach(c -> { t2._2().getCommunitiesId().forEach(c -> {
if(!context.contains(c)){ if (!context.contains(c)) {
Context con = new Context(); Context con = new Context();
con.setId(c); con.setId(c);
con.setDataInfo(Arrays.asList(OafMapperUtils.dataInfo(false,TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false, con
OafMapperUtils.qualifier(TaggingConstants.CLASS_ID_DATASOURCE, TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), "1"))); .setDataInfo(
ds.getContext().add(con); Arrays
} .asList(
}); OafMapperUtils
} .dataInfo(
return ds; false, TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
} ,Encoders.bean(Project.class)) OafMapperUtils
.write() .qualifier(
.mode(SaveMode.Overwrite) TaggingConstants.CLASS_ID_DATASOURCE,
.option("compression","gzip") TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE,
.json(outputPath + "project"); ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"1")));
ds.getContext().add(con);
}
});
}
return ds;
}, Encoders.bean(Project.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "project");
readPath(spark, outputPath + "project", Datasource.class) readPath(spark, outputPath + "project", Datasource.class)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression","gzip") .option("compression", "gzip")
.json(inputPath + "project"); .json(inputPath + "project");
} }
private static void execDatasourceTag(SparkSession spark, String inputPath, String outputPath,
private static void execDatasourceTag(SparkSession spark, String inputPath, String outputPath, List<EntityCommunities> datasourceCommunities) { List<EntityCommunities> datasourceCommunities) {
Dataset<Datasource> datasource = readPath(spark, inputPath + "datasource", Datasource.class); Dataset<Datasource> datasource = readPath(spark, inputPath + "datasource", Datasource.class);
Dataset<EntityCommunities> dc = spark.createDataset(datasourceCommunities, Encoders.bean(EntityCommunities.class)); Dataset<EntityCommunities> dc = spark
.createDataset(datasourceCommunities, Encoders.bean(EntityCommunities.class));
datasource.joinWith(dc, datasource.col("id").equalTo(dc.col("entityId")), "left") datasource
.map((MapFunction<Tuple2<Datasource, EntityCommunities>, Datasource>) t2 -> { .joinWith(dc, datasource.col("id").equalTo(dc.col("entityId")), "left")
Datasource ds = t2._1(); .map((MapFunction<Tuple2<Datasource, EntityCommunities>, Datasource>) t2 -> {
if (t2._2() != null){ Datasource ds = t2._1();
if (t2._2() != null) {
List<String> context = List<String> context = Optional
Optional.ofNullable(ds.getContext()) .ofNullable(ds.getContext())
.map(v -> v.stream().map(c -> c.getId()).collect(Collectors.toList())) .map(v -> v.stream().map(c -> c.getId()).collect(Collectors.toList()))
.orElse(new ArrayList<>()); .orElse(new ArrayList<>());
if(!Optional.ofNullable(ds.getContext()).isPresent()) if (!Optional.ofNullable(ds.getContext()).isPresent())
ds.setContext(new ArrayList<>()); ds.setContext(new ArrayList<>());
t2._2().getCommunitiesId().forEach(c -> { t2._2().getCommunitiesId().forEach(c -> {
if(!context.contains(c)){ if (!context.contains(c)) {
Context con = new Context(); Context con = new Context();
con.setId(c); con.setId(c);
con.setDataInfo(Arrays.asList(OafMapperUtils.dataInfo(false,TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false, con
OafMapperUtils.qualifier(TaggingConstants.CLASS_ID_DATASOURCE, TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), "1"))); .setDataInfo(
ds.getContext().add(con); Arrays
} .asList(
}); OafMapperUtils
} .dataInfo(
return ds; false, TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
} ,Encoders.bean(Datasource.class)) OafMapperUtils
.write() .qualifier(
.mode(SaveMode.Overwrite) TaggingConstants.CLASS_ID_DATASOURCE,
.option("compression","gzip") TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE,
.json(outputPath + "datasource"); ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"1")));
ds.getContext().add(con);
}
});
}
return ds;
}, Encoders.bean(Datasource.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "datasource");
readPath(spark, outputPath + "datasource", Datasource.class) readPath(spark, outputPath + "datasource", Datasource.class)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression","gzip") .option("compression", "gzip")
.json(inputPath + "datasource"); .json(inputPath + "datasource");
} }
private static void extendCommunityConfigurationForEOSC(SparkSession spark, String inputPath, private static void extendCommunityConfigurationForEOSC(SparkSession spark, String inputPath,
CommunityConfiguration cc) { CommunityConfiguration cc) {
@ -273,6 +309,4 @@ public class SparkBulkTagJob {
}; };
} }
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.bulktag.actions; package eu.dnetlib.dhp.bulktag.actions;
import java.io.Serializable; import java.io.Serializable;
@ -8,31 +9,31 @@ import java.util.List;
* @Date 22/01/24 * @Date 22/01/24
*/ */
public class Action implements Serializable { public class Action implements Serializable {
private String clazz; private String clazz;
private String method; private String method;
private List<Parameters> params; private List<Parameters> params;
public String getClazz() { public String getClazz() {
return clazz; return clazz;
} }
public void setClazz(String clazz) { public void setClazz(String clazz) {
this.clazz = clazz; this.clazz = clazz;
} }
public String getMethod() { public String getMethod() {
return method; return method;
} }
public void setMethod(String method) { public void setMethod(String method) {
this.method = method; this.method = method;
} }
public List<Parameters> getParams() { public List<Parameters> getParams() {
return params; return params;
} }
public void setParams(List<Parameters> params) { public void setParams(List<Parameters> params) {
this.params = params; this.params = params;
} }
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.bulktag.actions; package eu.dnetlib.dhp.bulktag.actions;
import java.io.Serializable; import java.io.Serializable;
@ -8,47 +9,47 @@ import java.io.Serializable;
*/ */
public class ExecSubstringAction implements Serializable { public class ExecSubstringAction implements Serializable {
private String value; private String value;
private String from; private String from;
private String to; private String to;
public String getValue() { public String getValue() {
return value; return value;
} }
public void setValue(String value) { public void setValue(String value) {
this.value = value; this.value = value;
} }
public String getFrom() { public String getFrom() {
return from; return from;
} }
public void setFrom(String from) { public void setFrom(String from) {
this.from = from; this.from = from;
} }
public String getTo() { public String getTo() {
return to; return to;
} }
public void setTo(String to) { public void setTo(String to) {
this.to = to; this.to = to;
} }
public String execSubstring(){ public String execSubstring() {
int to = Integer.valueOf(this.to); int to = Integer.valueOf(this.to);
int from = Integer.valueOf(this.from); int from = Integer.valueOf(this.from);
if(to < from || from > this.value.length()) if (to < from || from > this.value.length())
return ""; return "";
if(from < 0) if (from < 0)
from = 0; from = 0;
if (to > this.value.length()) if (to > this.value.length())
to = this.value.length(); to = this.value.length();
return this.value.substring(from, to); return this.value.substring(from, to);
} }
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.bulktag.actions; package eu.dnetlib.dhp.bulktag.actions;
import java.io.Serializable; import java.io.Serializable;
@ -8,23 +9,22 @@ import java.io.Serializable;
*/ */
public class MapModel implements Serializable { public class MapModel implements Serializable {
private String path; private String path;
private Action action; private Action action;
public String getPath() {
return path;
}
public String getPath() { public void setPath(String path) {
return path; this.path = path;
} }
public void setPath(String path) { public Action getAction() {
this.path = path; return action;
} }
public Action getAction() { public void setAction(Action action) {
return action; this.action = action;
} }
public void setAction(Action action) {
this.action = action;
}
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.bulktag.actions; package eu.dnetlib.dhp.bulktag.actions;
import java.io.Serializable; import java.io.Serializable;
@ -7,22 +8,22 @@ import java.io.Serializable;
* @Date 22/01/24 * @Date 22/01/24
*/ */
public class Parameters implements Serializable { public class Parameters implements Serializable {
private String paramName; private String paramName;
private String paramValue; private String paramValue;
public String getParamName() { public String getParamName() {
return paramName; return paramName;
} }
public void setParamName(String paramName) { public void setParamName(String paramName) {
this.paramName = paramName; this.paramName = paramName;
} }
public String getParamValue() { public String getParamValue() {
return paramValue; return paramValue;
} }
public void setParamValue(String paramValue) { public void setParamValue(String paramValue) {
this.paramValue = paramValue; this.paramValue = paramValue;
} }
} }

View File

@ -1,11 +1,11 @@
package eu.dnetlib.dhp.bulktag.community; package eu.dnetlib.dhp.bulktag.community;
import eu.dnetlib.dhp.bulktag.actions.MapModel;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import eu.dnetlib.dhp.bulktag.actions.MapModel;
public class ProtoMap extends HashMap<String, MapModel> implements Serializable { public class ProtoMap extends HashMap<String, MapModel> implements Serializable {
public ProtoMap() { public ProtoMap() {

View File

@ -10,9 +10,6 @@ import java.lang.reflect.Method;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.jayway.jsonpath.PathNotFoundException;
import eu.dnetlib.dhp.bulktag.actions.MapModel;
import eu.dnetlib.dhp.bulktag.actions.Parameters;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -20,7 +17,10 @@ import org.slf4j.LoggerFactory;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.PathNotFoundException;
import eu.dnetlib.dhp.bulktag.actions.MapModel;
import eu.dnetlib.dhp.bulktag.actions.Parameters;
import eu.dnetlib.dhp.bulktag.eosc.EoscIFTag; import eu.dnetlib.dhp.bulktag.eosc.EoscIFTag;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ -40,7 +40,8 @@ public class ResultTagger implements Serializable {
return (tmp != clist.size()); return (tmp != clist.size());
} }
private Map<String, List<String>> getParamMap(final Result result, Map<String, MapModel> params) throws NoSuchMethodException, InvocationTargetException { private Map<String, List<String>> getParamMap(final Result result, Map<String, MapModel> params)
throws NoSuchMethodException, InvocationTargetException {
Map<String, List<String>> param = new HashMap<>(); Map<String, List<String>> param = new HashMap<>();
String json = new Gson().toJson(result, Result.class); String json = new Gson().toJson(result, Result.class);
DocumentContext jsonContext = JsonPath.parse(json); DocumentContext jsonContext = JsonPath.parse(json);
@ -54,30 +55,34 @@ public class ResultTagger implements Serializable {
try { try {
String path = mapModel.getPath(); String path = mapModel.getPath();
Object obj = jsonContext.read(path); Object obj = jsonContext.read(path);
List<String> pathValue ; List<String> pathValue;
if(obj instanceof java.lang.String) if (obj instanceof java.lang.String)
pathValue = Arrays.asList((String)obj); pathValue = Arrays.asList((String) obj);
else else
pathValue = (List<String>)obj; pathValue = (List<String>) obj;
if(Optional.ofNullable(mapModel.getAction()).isPresent()){ if (Optional.ofNullable(mapModel.getAction()).isPresent()) {
Class<?> c = Class.forName(mapModel.getAction().getClazz()); Class<?> c = Class.forName(mapModel.getAction().getClazz());
Object class_instance = c.newInstance(); Object class_instance = c.newInstance();
Method setField = c.getMethod("setValue", String.class); Method setField = c.getMethod("setValue", String.class);
setField.invoke(class_instance, pathValue.get(0)); setField.invoke(class_instance, pathValue.get(0));
for(Parameters p : mapModel.getAction().getParams()){ for (Parameters p : mapModel.getAction().getParams()) {
setField = c.getMethod("set" + p.getParamName(), String.class); setField = c.getMethod("set" + p.getParamName(), String.class);
setField.invoke(class_instance, p.getParamValue()); setField.invoke(class_instance, p.getParamValue());
} }
param.put(key,Arrays.asList((String)c.getMethod(mapModel.getAction().getMethod()).invoke(class_instance))); param
.put(
key, Arrays
.asList((String) c.getMethod(mapModel.getAction().getMethod()).invoke(class_instance)));
} }
else{ else {
param.put(key, pathValue); param.put(key, pathValue);
} }
} catch (PathNotFoundException | ClassNotFoundException | InstantiationException | IllegalAccessException e) { } catch (PathNotFoundException | ClassNotFoundException | InstantiationException
| IllegalAccessException e) {
param.put(key, new ArrayList<>()); param.put(key, new ArrayList<>());
} }
} }
@ -86,9 +91,8 @@ public class ResultTagger implements Serializable {
} }
public <R extends Result> R enrichContextCriteria( public <R extends Result> R enrichContextCriteria(
final R result, final CommunityConfiguration conf, final Map<String, MapModel> criteria) throws InvocationTargetException, NoSuchMethodException { final R result, final CommunityConfiguration conf, final Map<String, MapModel> criteria)
throws InvocationTargetException, NoSuchMethodException {
// Verify if the entity is deletedbyinference. In case verify if to clean the context list // Verify if the entity is deletedbyinference. In case verify if to clean the context list
// from all the zenodo communities // from all the zenodo communities

View File

@ -7,20 +7,23 @@ datasourceWhitelistForCountryPropagation=10|opendoar____::16e6a3326dd7d868cbc926
#allowedtypes=pubsrepository::institutional #allowedtypes=pubsrepository::institutional
allowedtypes=Institutional allowedtypes=Institutional
outputPath=/tmp/miriam/graph/11_graph_orcid outputPath=/tmp/miriam/graph/11_graph_orcid
pathMap ={"author":"$['author'][*]['fullname']", \ pathMap ={"author":{"path":"$['author'][*]['fullname']"}, \
"title":"$['title'][*]['value']",\ "title":{"path":"$['title'][*]['value']"},\
"orcid":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid')]['value']" ,\ "orcid":{"path":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid')]['value']"} ,\
"orcid_pending":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']" ,\ "orcid_pending":{"path":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']"} ,\
"contributor" : "$['contributor'][*]['value']",\ "contributor" : {"path":"$['contributor'][*]['value']"},\
"description" : "$['description'][*]['value']",\ "description" : {"path":"$['description'][*]['value']"},\
"subject" :"$['subject'][*]['value']" , \ "subject" :{"path":"$['subject'][*]['value']"}, \
"fos" : "$['subject'][?(@['qualifier']['classid']=='FOS')].value" ,\ "fos" : {"path":"$['subject'][?(@['qualifier']['classid']=='FOS')].value"} ,\
"sdg" : "$['subject'][?(@['qualifier']['classid']=='SDG')].value",\ "sdg" : {"path":"$['subject'][?(@['qualifier']['classid']=='SDG')].value"},\
"journal":"$['journal'].name",\ "journal":{"path":"$['journal'].name"},\
"hostedby":"$['instance'][*]['hostedby']['key']",\ "hostedby":{"path":"$['instance'][*]['hostedby']['key']"},\
"collectedfrom":"$['instance'][*]['collectedfrom']['key']",\ "collectedfrom":{"path":"$['instance'][*]['collectedfrom']['key']"},\
"publisher":"$['publisher'].value",\ "publisher":{"path":"$['publisher'].value"},\
"publicationyear":"$['dateofacceptance'].value"} "publicationyear":{"path":"$['dateofacceptance'].value", "action":{"class":"eu.dnetlib.dhp.bulktag.actions.ExecSubstringAction",\
"method":"execSubstring",\
"params":[{"param_name":"From","param_value":0},\
{"param_name":"To","param_value":4}]}}}
blacklist=empty blacklist=empty
allowedpids=orcid;orcid_pending allowedpids=orcid;orcid_pending
baseURL = https://services.openaire.eu/openaire/community/ baseURL = https://services.openaire.eu/openaire/community/

View File

@ -33,25 +33,25 @@ public class BulkTagJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final String pathMap = "{\"author\":{\"path\":\"$['author'][*]['fullname']\"}," + public static final String pathMap = "{\"author\":{\"path\":\"$['author'][*]['fullname']\"}," +
" \"title\":{\"path\":\"$['title'][*]['value']\"}, "+ " \"title\":{\"path\":\"$['title'][*]['value']\"}, " +
" \"orcid\":{\"path\":\"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid')]['value']\"} , " + " \"orcid\":{\"path\":\"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid')]['value']\"} , " +
" \"orcid_pending\":{\"path\":\"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']\"} ,"+ " \"orcid_pending\":{\"path\":\"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']\"} ,"
"\"contributor\" : {\"path\":\"$['contributor'][*]['value']\"},"+ +
" \"description\" : {\"path\":\"$['description'][*]['value']\"},"+ "\"contributor\" : {\"path\":\"$['contributor'][*]['value']\"}," +
" \"subject\" :{\"path\":\"$['subject'][*]['value']\"}, " + " \"description\" : {\"path\":\"$['description'][*]['value']\"}," +
" \"fos\" : {\"path\":\"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"} , "+ " \"subject\" :{\"path\":\"$['subject'][*]['value']\"}, " +
"\"sdg\" : {\"path\":\"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"}," + " \"fos\" : {\"path\":\"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"} , " +
"\"journal\":{\"path\":\"$['journal'].name\"}," + "\"sdg\" : {\"path\":\"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"}," +
"\"hostedby\":{\"path\":\"$['instance'][*]['hostedby']['key']\"}," + "\"journal\":{\"path\":\"$['journal'].name\"}," +
"\"collectedfrom\":{\"path\":\"$['instance'][*]['collectedfrom']['key']\"}," + "\"hostedby\":{\"path\":\"$['instance'][*]['hostedby']['key']\"}," +
"\"publisher\":{\"path\":\"$['publisher'].value\"}," + "\"collectedfrom\":{\"path\":\"$['instance'][*]['collectedfrom']['key']\"}," +
"\"publicationyear\":{\"path\":\"$['dateofacceptance'].value\", " + "\"publisher\":{\"path\":\"$['publisher'].value\"}," +
" \"action\":{\"clazz\":\"eu.dnetlib.dhp.bulktag.actions.ExecSubstringAction\"," + "\"publicationyear\":{\"path\":\"$['dateofacceptance'].value\", " +
"\"method\":\"execSubstring\","+ " \"action\":{\"clazz\":\"eu.dnetlib.dhp.bulktag.actions.ExecSubstringAction\"," +
"\"params\":[" + "\"method\":\"execSubstring\"," +
"{\"paramName\":\"From\", \"paramValue\":0}, " + "\"params\":[" +
"{\"paramName\":\"To\",\"paramValue\":4}]}}}"; "{\"paramName\":\"From\", \"paramValue\":0}, " +
"{\"paramName\":\"To\",\"paramValue\":4}]}}}";
private static SparkSession spark; private static SparkSession spark;
@ -325,7 +325,7 @@ public class BulkTagJobTest {
"-taggingConf", taggingConf, "-taggingConf", taggingConf,
"-outputPath", workingDir.toString() + "/", "-outputPath", workingDir.toString() + "/",
"-baseURL", "https://services.openaire.eu/openaire/community/", "-baseURL", "https://services.openaire.eu/openaire/community/",
"-pathMap", pathMap "-pathMap", pathMap
}); });
@ -388,38 +388,38 @@ public class BulkTagJobTest {
@Test @Test
void datasourceTag() throws Exception { void datasourceTag() throws Exception {
final String sourcePath = getClass() final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/") .getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/")
.getPath(); .getPath();
SparkBulkTagJob SparkBulkTagJob
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath, "-sourcePath", sourcePath,
"-taggingConf", taggingConf, "-taggingConf", taggingConf,
"-outputPath", workingDir.toString() + "/", "-outputPath", workingDir.toString() + "/",
"-baseURL", "https://services.openaire.eu/openaire/community/", "-baseURL", "https://services.openaire.eu/openaire/community/",
"-pathMap", pathMap "-pathMap", pathMap
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Datasource> tmp = sc JavaRDD<Datasource> tmp = sc
.textFile(workingDir.toString() + "/datasource") .textFile(workingDir.toString() + "/datasource")
.map(item -> OBJECT_MAPPER.readValue(item, Datasource.class)); .map(item -> OBJECT_MAPPER.readValue(item, Datasource.class));
Assertions.assertEquals(3, tmp.count()); Assertions.assertEquals(3, tmp.count());
org.apache.spark.sql.Dataset<Datasource> verificationDataset = spark org.apache.spark.sql.Dataset<Datasource> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(Datasource.class)); .createDataset(tmp.rdd(), Encoders.bean(Datasource.class));
verificationDataset.createOrReplaceTempView("datasource"); verificationDataset.createOrReplaceTempView("datasource");
String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ "from datasource " + "from datasource "
+ "lateral view explode(context) c as MyT " + "lateral view explode(context) c as MyT "
+ "lateral view explode(MyT.datainfo) d as MyD " + "lateral view explode(MyT.datainfo) d as MyD "
+ "where MyD.inferenceprovenance = 'bulktagging'"; + "where MyD.inferenceprovenance = 'bulktagging'";
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query); org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
@ -427,19 +427,18 @@ public class BulkTagJobTest {
Assertions.assertEquals(3, idExplodeCommunity.count()); Assertions.assertEquals(3, idExplodeCommunity.count());
Assertions Assertions
.assertEquals( .assertEquals(
3, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
Assertions Assertions
.assertEquals( .assertEquals(
3, 3,
idExplodeCommunity idExplodeCommunity
.filter("name = 'Bulktagging for Community - Datasource'") .filter("name = 'Bulktagging for Community - Datasource'")
.count()); .count());
Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'dh-ch'").count()); Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'dh-ch'").count());
Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'clarin'").count()); Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'clarin'").count());
} }
@Test @Test
@ -1668,60 +1667,94 @@ public class BulkTagJobTest {
Assertions.assertEquals(0, spark.sql(query).count()); Assertions.assertEquals(0, spark.sql(query).count());
} }
@Test @Test
void pubdateTest() throws Exception { void pubdateTest() throws Exception {
final String pathMap = BulkTagJobTest.pathMap; final String pathMap = BulkTagJobTest.pathMap;
SparkBulkTagJob SparkBulkTagJob
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", "-sourcePath",
getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/publicationyear/").getPath(), getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/publicationyear/").getPath(),
"-taggingConf", "-taggingConf",
IOUtils IOUtils
.toString( .toString(
BulkTagJobTest.class BulkTagJobTest.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_publicationdate.xml")), "/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_publicationdate.xml")),
"-outputPath", workingDir.toString() + "/", "-outputPath", workingDir.toString() + "/",
"-pathMap", pathMap "-pathMap", pathMap
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset") .textFile(workingDir.toString() + "/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(10, tmp.count()); Assertions.assertEquals(10, tmp.count());
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); .createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
verificationDataset.createOrReplaceTempView("dataset"); verificationDataset.createOrReplaceTempView("dataset");
String query = "select id, MyT.id community, MyD.provenanceaction.classid " String query = "select id, MyT.id community, MyD.provenanceaction.classid "
+ "from dataset " + "from dataset "
+ "lateral view explode(context) c as MyT " + "lateral view explode(context) c as MyT "
+ "lateral view explode(MyT.datainfo) d as MyD " + "lateral view explode(MyT.datainfo) d as MyD "
+ "where MyD.inferenceprovenance = 'bulktagging'"; + "where MyD.inferenceprovenance = 'bulktagging'";
org.apache.spark.sql.Dataset<Row> queryResult = spark.sql(query); org.apache.spark.sql.Dataset<Row> queryResult = spark.sql(query);
queryResult.show(false); queryResult.show(false);
Assertions.assertEquals(5, queryResult.count()); Assertions.assertEquals(5, queryResult.count());
Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529")).count()); Assertions
Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::2f4f3c820c450bd08dac08d07cc82dcf")).count()); .assertEquals(
Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::7fcbe3a03280663cddebfd3cb9203177")).count()); 1,
Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::d791339867bec6d3eb2104deeb4e4961")).count()); queryResult
Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::d90d3a1f64ad264b5ebed8a35b280343")).count()); .filter(
(FilterFunction<Row>) r -> r
.getAs("id")
.equals("50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529"))
.count());
Assertions
.assertEquals(
1,
queryResult
.filter(
(FilterFunction<Row>) r -> r
.getAs("id")
.equals("50|od______3989::2f4f3c820c450bd08dac08d07cc82dcf"))
.count());
Assertions
.assertEquals(
1,
queryResult
.filter(
(FilterFunction<Row>) r -> r
.getAs("id")
.equals("50|od______3989::7fcbe3a03280663cddebfd3cb9203177"))
.count());
Assertions
.assertEquals(
1,
queryResult
.filter(
(FilterFunction<Row>) r -> r
.getAs("id")
.equals("50|od______3989::d791339867bec6d3eb2104deeb4e4961"))
.count());
Assertions
.assertEquals(
1,
queryResult
.filter(
(FilterFunction<Row>) r -> r
.getAs("id")
.equals("50|od______3989::d90d3a1f64ad264b5ebed8a35b280343"))
.count());
} }
} }