forked from D-Net/dnet-hadoop

This commit is contained in:
parent 83bb97be83
commit 8dae10b442
@@ -30,7 +30,7 @@ public class MoveResult implements Serializable {
 	public static void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
 				MoveResult.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/wf/subworkflows/input_moveresult_parameters.json"));
 
@@ -78,5 +78,4 @@ public class QueryCommunityAPI {
 		return body;
 	}
 
-
 }
@@ -6,10 +6,6 @@ import java.io.Serializable;
 import java.util.*;
 import java.util.stream.Collectors;
 
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.Datasource;
-import eu.dnetlib.dhp.schema.oaf.Organization;
-import eu.dnetlib.dhp.schema.oaf.Project;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -24,6 +20,10 @@ import eu.dnetlib.dhp.bulktag.community.CommunityConfiguration;
 import eu.dnetlib.dhp.bulktag.community.Provider;
 import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
 import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
+import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.Project;
 
 /**
  * @author miriam.baglioni
@@ -57,7 +57,7 @@ public class Utils implements Serializable {
 		if (d.getEnabled() == null || Boolean.FALSE.equals(d.getEnabled()))
 			return null;
 		Provider p = new Provider();
-		p.setOpenaireId(ModelSupport.getIdPrefix(Datasource.class)+"|" + d.getOpenaireId());
+		p.setOpenaireId(ModelSupport.getIdPrefix(Datasource.class) + "|" + d.getOpenaireId());
 		p.setSelectionConstraints(d.getSelectioncriteria());
 		if (p.getSelectionConstraints() != null)
 			p.getSelectionConstraints().setSelection(resolver);
@@ -175,27 +175,31 @@ public class Utils implements Serializable {
 			.collect(Collectors.toList());
 	}
 
-	public static List<EntityCommunities> getDatasourceCommunities(String baseURL)throws IOException{
+	public static List<EntityCommunities> getDatasourceCommunities(String baseURL) throws IOException {
 		List<CommunityModel> validCommunities = getValidCommunities(baseURL);
 		HashMap<String, Set<String>> map = new HashMap<>();
-		String entityPrefix = ModelSupport.getIdPrefix(Datasource.class) + "|" ;
+		String entityPrefix = ModelSupport.getIdPrefix(Datasource.class) + "|";
 
 		validCommunities.forEach(c -> {
 			try {
-				new ObjectMapper().readValue(QueryCommunityAPI.communityDatasource(c.getId(), baseURL), DatasourceList.class)
+				new ObjectMapper()
+					.readValue(QueryCommunityAPI.communityDatasource(c.getId(), baseURL), DatasourceList.class)
 					.forEach(d -> {
 						if (!map.keySet().contains(d.getOpenaireId()))
 							map.put(d.getOpenaireId(), new HashSet<>());
 
 						map.get(d.getOpenaireId()).add(c.getId());
 					});
 			} catch (IOException e) {
 				throw new RuntimeException(e);
 			}
 		});
 
-		List<EntityCommunities> temp = map.keySet().stream()
-			.map(k -> EntityCommunities.newInstance(entityPrefix + k, getCollect(k, map))).collect(Collectors.toList());
+		List<EntityCommunities> temp = map
+			.keySet()
+			.stream()
+			.map(k -> EntityCommunities.newInstance(entityPrefix + k, getCollect(k, map)))
+			.collect(Collectors.toList());
 
 		return temp;
 
@@ -207,5 +211,4 @@ public class Utils implements Serializable {
 		return temp;
 	}
 
-
 }
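Note on the reworked getDatasourceCommunities above: it still queries the community API for each valid community, inverts the answer into a datasource-to-communities map, and wraps each entry into an EntityCommunities record whose id carries the datasource id prefix ("10|", as the properties file later in this diff also shows). A minimal sketch of that inversion, using hypothetical community and datasource ids and plain JDK collections rather than the dnet-hadoop model classes:

```java
import java.util.*;
import java.util.stream.Collectors;

public class DatasourceCommunitiesSketch {
	public static void main(String[] args) {
		// hypothetical API answer: community id -> datasource ids it declares
		Map<String, List<String>> communityDatasources = Map.of(
			"dh-ch", List.of("doajarticles::1234"),
			"clarin", List.of("doajarticles::1234", "opendoar____::abcd"));

		// invert it: datasource id -> set of community ids (the same idea as the map built in getDatasourceCommunities)
		Map<String, Set<String>> byDatasource = new HashMap<>();
		communityDatasources.forEach((community, datasources) -> datasources
			.forEach(ds -> byDatasource.computeIfAbsent(ds, k -> new HashSet<>()).add(community)));

		// one EntityCommunities-like line per datasource, id prefixed as in the patched code
		String entityPrefix = "10|";
		List<String> records = byDatasource
			.entrySet()
			.stream()
			.map(e -> entityPrefix + e.getKey() + " -> " + e.getValue())
			.collect(Collectors.toList());
		records.forEach(System.out::println);
	}
}
```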
@@ -1,39 +1,40 @@
+
 package eu.dnetlib.dhp.api.model;
 
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.Datasource;
-
 import java.io.Serializable;
 import java.util.List;
 
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
+
 /**
  * @author miriam.baglioni
  * @Date 13/02/24
  */
 public class EntityCommunities implements Serializable {
 	private String entityId;
 	private List<String> communitiesId;
 
 	public String getEntityId() {
 		return entityId;
 	}
 
 	public void setEntityId(String entityId) {
 		this.entityId = entityId;
 	}
 
 	public List<String> getCommunitiesId() {
 		return communitiesId;
 	}
 
 	public void setCommunitiesId(List<String> communitiesId) {
 		this.communitiesId = communitiesId;
 	}
 
-	public static EntityCommunities newInstance(String dsid, List<String> csid){
+	public static EntityCommunities newInstance(String dsid, List<String> csid) {
 		EntityCommunities dsc = new EntityCommunities();
 		dsc.entityId = dsid;
 		dsc.communitiesId = csid;
 		return dsc;
 	}
 }
@@ -7,11 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.*;
 import java.util.stream.Collectors;
 
-import eu.dnetlib.dhp.api.model.CommunityEntityMap;
-import eu.dnetlib.dhp.api.model.EntityCommunities;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.Context;
-import eu.dnetlib.dhp.schema.oaf.Project;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
@@ -27,10 +22,15 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.gson.Gson;
 
 import eu.dnetlib.dhp.api.Utils;
+import eu.dnetlib.dhp.api.model.CommunityEntityMap;
+import eu.dnetlib.dhp.api.model.EntityCommunities;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.bulktag.community.*;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Context;
 import eu.dnetlib.dhp.schema.oaf.Datasource;
+import eu.dnetlib.dhp.schema.oaf.Project;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;
@@ -53,6 +53,7 @@ public class SparkBulkTagJob {
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/wf/subworkflows/bulktag/input_bulkTag_parameters.json"));
 
+		log.info(args.toString());
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
 
@@ -71,7 +72,8 @@ public class SparkBulkTagJob {
 		final String baseURL = parser.get("baseURL");
 		log.info("baseURL: {}", baseURL);
 
-		ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap"), ProtoMap.class);
+		log.info("pathMap: {}", parser.get("pathMap"));
+		ProtoMap protoMappingParams = new Gson().fromJson(parser.get("pathMap") + "}}", ProtoMap.class);
 		log.info("pathMap: {}", new Gson().toJson(protoMappingParams));
 
 		SparkConf conf = new SparkConf();
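For context on the change just above: the pathMap job parameter is deserialized with Gson into a ProtoMap, which (per the ProtoMap diff further down) is a HashMap<String, MapModel>; the appended "}}" apparently compensates for closing braces that are lost before the parameter reaches the job. A minimal, self-contained sketch of the intended deserialization, using simplified stand-ins for MapModel and Action rather than the real eu.dnetlib.dhp.bulktag.actions classes:

```java
import java.util.HashMap;

import com.google.gson.Gson;

public class PathMapParseSketch {

	// simplified stand-ins for MapModel / Action (illustrative, not the repository classes)
	static class MapModel {
		String path;
		Action action;
	}

	static class Action {
		String clazz;
		String method;
	}

	static class ProtoMapSketch extends HashMap<String, MapModel> {
	}

	public static void main(String[] args) {
		// a pathMap value in the new object form: every key maps to {"path": ...} plus an optional action
		String pathMap = "{\"author\":{\"path\":\"$['author'][*]['fullname']\"},"
			+ "\"publicationyear\":{\"path\":\"$['dateofacceptance'].value\","
			+ "\"action\":{\"clazz\":\"eu.dnetlib.dhp.bulktag.actions.ExecSubstringAction\",\"method\":\"execSubstring\"}}}";

		ProtoMapSketch parsed = new Gson().fromJson(pathMap, ProtoMapSketch.class);
		System.out.println(parsed.get("author").path);                 // $['author'][*]['fullname']
		System.out.println(parsed.get("publicationyear").action.method); // execSubstring
	}
}
```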
@@ -100,89 +102,123 @@ public class SparkBulkTagJob {
 				});
 		}
 
-	private static void execProjectTag(SparkSession spark, String inputPath, String outputPath, CommunityEntityMap communityProjects) {
+	private static void execProjectTag(SparkSession spark, String inputPath, String outputPath,
+		CommunityEntityMap communityProjects) {
 		Dataset<Project> projects = readPath(spark, inputPath + "project", Project.class);
-		Dataset<EntityCommunities> pc = spark.createDataset(communityProjects.keySet().stream().map(k -> EntityCommunities.newInstance(k, communityProjects.get(k))).collect(Collectors.toList()), Encoders.bean(EntityCommunities.class));
+		Dataset<EntityCommunities> pc = spark
+			.createDataset(
+				communityProjects
+					.keySet()
+					.stream()
+					.map(k -> EntityCommunities.newInstance(k, communityProjects.get(k)))
+					.collect(Collectors.toList()),
+				Encoders.bean(EntityCommunities.class));
 
-		projects.joinWith(pc, projects.col("id").equalTo(pc.col("entityId")), "left")
-			.map((MapFunction<Tuple2<Project, EntityCommunities>, Project>) t2 -> {
-				Project ds = t2._1();
-				if (t2._2() != null){
-					List<String> context =
-						Optional.ofNullable(ds.getContext())
-							.map(v -> v.stream().map(c -> c.getId()).collect(Collectors.toList()))
-							.orElse(new ArrayList<>());
+		projects
+			.joinWith(pc, projects.col("id").equalTo(pc.col("entityId")), "left")
+			.map((MapFunction<Tuple2<Project, EntityCommunities>, Project>) t2 -> {
+				Project ds = t2._1();
+				if (t2._2() != null) {
+					List<String> context = Optional
+						.ofNullable(ds.getContext())
+						.map(v -> v.stream().map(c -> c.getId()).collect(Collectors.toList()))
+						.orElse(new ArrayList<>());
 
-					if(!Optional.ofNullable(ds.getContext()).isPresent())
+					if (!Optional.ofNullable(ds.getContext()).isPresent())
 						ds.setContext(new ArrayList<>());
 					t2._2().getCommunitiesId().forEach(c -> {
-						if(!context.contains(c)){
+						if (!context.contains(c)) {
 							Context con = new Context();
 							con.setId(c);
-							con.setDataInfo(Arrays.asList(OafMapperUtils.dataInfo(false,TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
-								OafMapperUtils.qualifier(TaggingConstants.CLASS_ID_DATASOURCE, TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), "1")));
-							ds.getContext().add(con);
-						}
-					});
-				}
-				return ds;
-			} ,Encoders.bean(Project.class))
-			.write()
-			.mode(SaveMode.Overwrite)
-			.option("compression","gzip")
-			.json(outputPath + "project");
+							con
+								.setDataInfo(
+									Arrays
+										.asList(
+											OafMapperUtils
+												.dataInfo(
+													false, TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
+													OafMapperUtils
+														.qualifier(
+															TaggingConstants.CLASS_ID_DATASOURCE,
+															TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE,
+															ModelConstants.DNET_PROVENANCE_ACTIONS,
+															ModelConstants.DNET_PROVENANCE_ACTIONS),
+													"1")));
+							ds.getContext().add(con);
+						}
+					});
+				}
+				return ds;
+			}, Encoders.bean(Project.class))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(outputPath + "project");
 
 		readPath(spark, outputPath + "project", Datasource.class)
 			.write()
 			.mode(SaveMode.Overwrite)
-			.option("compression","gzip")
+			.option("compression", "gzip")
 			.json(inputPath + "project");
 	}
 
-	private static void execDatasourceTag(SparkSession spark, String inputPath, String outputPath, List<EntityCommunities> datasourceCommunities) {
+	private static void execDatasourceTag(SparkSession spark, String inputPath, String outputPath,
+		List<EntityCommunities> datasourceCommunities) {
 		Dataset<Datasource> datasource = readPath(spark, inputPath + "datasource", Datasource.class);
 
-		Dataset<EntityCommunities> dc = spark.createDataset(datasourceCommunities, Encoders.bean(EntityCommunities.class));
+		Dataset<EntityCommunities> dc = spark
+			.createDataset(datasourceCommunities, Encoders.bean(EntityCommunities.class));
 
-		datasource.joinWith(dc, datasource.col("id").equalTo(dc.col("entityId")), "left")
-			.map((MapFunction<Tuple2<Datasource, EntityCommunities>, Datasource>) t2 -> {
-				Datasource ds = t2._1();
-				if (t2._2() != null){
+		datasource
+			.joinWith(dc, datasource.col("id").equalTo(dc.col("entityId")), "left")
+			.map((MapFunction<Tuple2<Datasource, EntityCommunities>, Datasource>) t2 -> {
+				Datasource ds = t2._1();
+				if (t2._2() != null) {
 
-					List<String> context =
-						Optional.ofNullable(ds.getContext())
+					List<String> context = Optional
+						.ofNullable(ds.getContext())
 						.map(v -> v.stream().map(c -> c.getId()).collect(Collectors.toList()))
 						.orElse(new ArrayList<>());
 
-					if(!Optional.ofNullable(ds.getContext()).isPresent())
+					if (!Optional.ofNullable(ds.getContext()).isPresent())
 						ds.setContext(new ArrayList<>());
 
-					t2._2().getCommunitiesId().forEach(c -> {
-						if(!context.contains(c)){
-							Context con = new Context();
-							con.setId(c);
-							con.setDataInfo(Arrays.asList(OafMapperUtils.dataInfo(false,TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
-								OafMapperUtils.qualifier(TaggingConstants.CLASS_ID_DATASOURCE, TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), "1")));
-							ds.getContext().add(con);
-						}
-					});
-				}
-				return ds;
-			} ,Encoders.bean(Datasource.class))
-			.write()
-			.mode(SaveMode.Overwrite)
-			.option("compression","gzip")
-			.json(outputPath + "datasource");
+					t2._2().getCommunitiesId().forEach(c -> {
+						if (!context.contains(c)) {
+							Context con = new Context();
+							con.setId(c);
+							con
+								.setDataInfo(
+									Arrays
+										.asList(
+											OafMapperUtils
+												.dataInfo(
+													false, TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
+													OafMapperUtils
+														.qualifier(
+															TaggingConstants.CLASS_ID_DATASOURCE,
+															TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE,
+															ModelConstants.DNET_PROVENANCE_ACTIONS,
+															ModelConstants.DNET_PROVENANCE_ACTIONS),
+													"1")));
+							ds.getContext().add(con);
+						}
+					});
+				}
+				return ds;
+			}, Encoders.bean(Datasource.class))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.json(outputPath + "datasource");
 
 		readPath(spark, outputPath + "datasource", Datasource.class)
 			.write()
 			.mode(SaveMode.Overwrite)
-			.option("compression","gzip")
+			.option("compression", "gzip")
 			.json(inputPath + "datasource");
 	}
 
 	private static void extendCommunityConfigurationForEOSC(SparkSession spark, String inputPath,
 		CommunityConfiguration cc) {
 
@@ -273,6 +309,4 @@ public class SparkBulkTagJob {
 		};
 	}
 
-
-
 }
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.bulktag.actions;
 
 import java.io.Serializable;
@@ -8,31 +9,31 @@ import java.util.List;
  * @Date 22/01/24
  */
 public class Action implements Serializable {
 	private String clazz;
 	private String method;
 	private List<Parameters> params;
 
 	public String getClazz() {
 		return clazz;
 	}
 
 	public void setClazz(String clazz) {
 		this.clazz = clazz;
 	}
 
 	public String getMethod() {
 		return method;
 	}
 
 	public void setMethod(String method) {
 		this.method = method;
 	}
 
 	public List<Parameters> getParams() {
 		return params;
 	}
 
 	public void setParams(List<Parameters> params) {
 		this.params = params;
 	}
 }
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.bulktag.actions;
 
 import java.io.Serializable;
@@ -8,47 +9,47 @@ import java.io.Serializable;
  */
 public class ExecSubstringAction implements Serializable {
 
 	private String value;
 	private String from;
 	private String to;
 
 	public String getValue() {
 		return value;
 	}
 
 	public void setValue(String value) {
 		this.value = value;
 	}
 
 	public String getFrom() {
 		return from;
 	}
 
 	public void setFrom(String from) {
 		this.from = from;
 	}
 
 	public String getTo() {
 		return to;
 	}
 
 	public void setTo(String to) {
 		this.to = to;
 	}
 
-	public String execSubstring(){
+	public String execSubstring() {
 		int to = Integer.valueOf(this.to);
 		int from = Integer.valueOf(this.from);
 
-		if(to < from || from > this.value.length())
+		if (to < from || from > this.value.length())
 			return "";
 
-		if(from < 0)
+		if (from < 0)
 			from = 0;
 		if (to > this.value.length())
 			to = this.value.length();
 
 		return this.value.substring(from, to);
 
 	}
 }
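A quick worked example of the clamping logic in execSubstring above, as a standalone sketch that mirrors the method body rather than instantiating the class itself: with the publicationyear mapping configured with From=0 and To=4, a dateofacceptance value like "2019-05-27" is reduced to "2019"; out-of-range bounds are clamped, and an empty string is returned when from exceeds the value length or to is smaller than from.

```java
public class ExecSubstringSketch {
	public static void main(String[] args) {
		// same behaviour as ExecSubstringAction.execSubstring(): clamp the bounds, then substring
		System.out.println(substringClamped("2019-05-27", 0, 4)); // "2019" -> the publicationyear case from the pathMap
		System.out.println(substringClamped("2019", 0, 10));      // "2019" -> 'to' is clamped to the value length
		System.out.println(substringClamped("2019", 6, 4));       // ""     -> from > length (or to < from) yields ""
	}

	// stand-in for ExecSubstringAction with value/from/to already set through its setters
	static String substringClamped(String value, int from, int to) {
		if (to < from || from > value.length())
			return "";
		if (from < 0)
			from = 0;
		if (to > value.length())
			to = value.length();
		return value.substring(from, to);
	}
}
```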
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.bulktag.actions;
 
 import java.io.Serializable;
@@ -8,23 +9,22 @@ import java.io.Serializable;
  */
 public class MapModel implements Serializable {
 
 	private String path;
 	private Action action;
 
 	public String getPath() {
 		return path;
 	}
 
 	public void setPath(String path) {
 		this.path = path;
 	}
 
 	public Action getAction() {
 		return action;
 	}
 
 	public void setAction(Action action) {
 		this.action = action;
 	}
-
 }
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.bulktag.actions;
 
 import java.io.Serializable;
@@ -7,22 +8,22 @@ import java.io.Serializable;
  * @Date 22/01/24
  */
 public class Parameters implements Serializable {
 	private String paramName;
 	private String paramValue;
 
 	public String getParamName() {
 		return paramName;
 	}
 
 	public void setParamName(String paramName) {
 		this.paramName = paramName;
 	}
 
 	public String getParamValue() {
 		return paramValue;
 	}
 
 	public void setParamValue(String paramValue) {
 		this.paramValue = paramValue;
 	}
 }
@@ -1,11 +1,11 @@
 
 package eu.dnetlib.dhp.bulktag.community;
 
-import eu.dnetlib.dhp.bulktag.actions.MapModel;
-
 import java.io.Serializable;
 import java.util.HashMap;
 
+import eu.dnetlib.dhp.bulktag.actions.MapModel;
+
 public class ProtoMap extends HashMap<String, MapModel> implements Serializable {
 
 	public ProtoMap() {
@@ -10,9 +10,6 @@ import java.lang.reflect.Method;
 import java.util.*;
 import java.util.stream.Collectors;
 
-import com.jayway.jsonpath.PathNotFoundException;
-import eu.dnetlib.dhp.bulktag.actions.MapModel;
-import eu.dnetlib.dhp.bulktag.actions.Parameters;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -20,7 +17,10 @@ import org.slf4j.LoggerFactory;
 import com.google.gson.Gson;
 import com.jayway.jsonpath.DocumentContext;
 import com.jayway.jsonpath.JsonPath;
+import com.jayway.jsonpath.PathNotFoundException;
+
+import eu.dnetlib.dhp.bulktag.actions.MapModel;
+import eu.dnetlib.dhp.bulktag.actions.Parameters;
 import eu.dnetlib.dhp.bulktag.eosc.EoscIFTag;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@@ -40,7 +40,8 @@ public class ResultTagger implements Serializable {
 		return (tmp != clist.size());
 	}
 
-	private Map<String, List<String>> getParamMap(final Result result, Map<String, MapModel> params) throws NoSuchMethodException, InvocationTargetException {
+	private Map<String, List<String>> getParamMap(final Result result, Map<String, MapModel> params)
+		throws NoSuchMethodException, InvocationTargetException {
 		Map<String, List<String>> param = new HashMap<>();
 		String json = new Gson().toJson(result, Result.class);
 		DocumentContext jsonContext = JsonPath.parse(json);
@@ -54,30 +55,34 @@ public class ResultTagger implements Serializable {
 			try {
 				String path = mapModel.getPath();
 				Object obj = jsonContext.read(path);
-				List<String> pathValue ;
-				if(obj instanceof java.lang.String)
-					pathValue = Arrays.asList((String)obj);
+				List<String> pathValue;
+				if (obj instanceof java.lang.String)
+					pathValue = Arrays.asList((String) obj);
 				else
-					pathValue = (List<String>)obj;
-				if(Optional.ofNullable(mapModel.getAction()).isPresent()){
+					pathValue = (List<String>) obj;
+				if (Optional.ofNullable(mapModel.getAction()).isPresent()) {
 					Class<?> c = Class.forName(mapModel.getAction().getClazz());
 					Object class_instance = c.newInstance();
 					Method setField = c.getMethod("setValue", String.class);
 					setField.invoke(class_instance, pathValue.get(0));
-					for(Parameters p : mapModel.getAction().getParams()){
+					for (Parameters p : mapModel.getAction().getParams()) {
 						setField = c.getMethod("set" + p.getParamName(), String.class);
 						setField.invoke(class_instance, p.getParamValue());
 					}
 
-					param.put(key,Arrays.asList((String)c.getMethod(mapModel.getAction().getMethod()).invoke(class_instance)));
+					param
+						.put(
+							key, Arrays
+								.asList((String) c.getMethod(mapModel.getAction().getMethod()).invoke(class_instance)));
 
 				}
 
-				else{
+				else {
 					param.put(key, pathValue);
 				}
 
-			} catch (PathNotFoundException | ClassNotFoundException | InstantiationException | IllegalAccessException e) {
+			} catch (PathNotFoundException | ClassNotFoundException | InstantiationException
+				| IllegalAccessException e) {
 				param.put(key, new ArrayList<>());
 			}
 		}
@@ -86,9 +91,8 @@ public class ResultTagger implements Serializable {
 	}
 
 	public <R extends Result> R enrichContextCriteria(
-		final R result, final CommunityConfiguration conf, final Map<String, MapModel> criteria) throws InvocationTargetException, NoSuchMethodException {
+		final R result, final CommunityConfiguration conf, final Map<String, MapModel> criteria)
+		throws InvocationTargetException, NoSuchMethodException {
 
-
 		// Verify if the entity is deletedbyinference. In case verify if to clean the context list
 		// from all the zenodo communities
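The reflective block above is what makes the new pathMap actions work: the action's clazz is instantiated, setValue receives the value extracted by the JSONPath, each Parameters entry is pushed through a matching set&lt;ParamName&gt; setter, and the configured method is invoked to produce the value that lands in the param map. A minimal, self-contained sketch of that call sequence; the bean, parameter names, and date value here are illustrative, not taken from the repository:

```java
import java.util.List;
import java.util.Map;

public class ReflectiveActionSketch {

	// illustrative action bean following the same setter convention as ExecSubstringAction
	public static class YearAction {
		private String value;
		private String from;
		private String to;

		public void setValue(String value) { this.value = value; }
		public void setFrom(String from) { this.from = from; }
		public void setTo(String to) { this.to = to; }

		public String execSubstring() {
			return value.substring(Integer.parseInt(from), Integer.parseInt(to));
		}
	}

	public static void main(String[] args) throws Exception {
		String clazz = YearAction.class.getName();   // would come from MapModel.getAction().getClazz()
		String method = "execSubstring";             // would come from MapModel.getAction().getMethod()
		Map<String, String> params = Map.of("From", "0", "To", "4");
		String extracted = "2019-05-27";             // value read through the JSONPath in MapModel.getPath()

		Class<?> c = Class.forName(clazz);
		Object instance = c.getDeclaredConstructor().newInstance();
		c.getMethod("setValue", String.class).invoke(instance, extracted);
		for (Map.Entry<String, String> p : params.entrySet()) {
			c.getMethod("set" + p.getKey(), String.class).invoke(instance, p.getValue());
		}
		String result = (String) c.getMethod(method).invoke(instance);
		System.out.println(List.of(result)); // [2019], the value that ends up in the param map
	}
}
```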
@@ -7,20 +7,23 @@ datasourceWhitelistForCountryPropagation=10|opendoar____::16e6a3326dd7d868cbc926
 #allowedtypes=pubsrepository::institutional
 allowedtypes=Institutional
 outputPath=/tmp/miriam/graph/11_graph_orcid
-pathMap ={"author":"$['author'][*]['fullname']", \
-  "title":"$['title'][*]['value']",\
-  "orcid":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid')]['value']" ,\
-  "orcid_pending":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']" ,\
-  "contributor" : "$['contributor'][*]['value']",\
-  "description" : "$['description'][*]['value']",\
-  "subject" :"$['subject'][*]['value']" , \
-  "fos" : "$['subject'][?(@['qualifier']['classid']=='FOS')].value" ,\
-  "sdg" : "$['subject'][?(@['qualifier']['classid']=='SDG')].value",\
-  "journal":"$['journal'].name",\
-  "hostedby":"$['instance'][*]['hostedby']['key']",\
-  "collectedfrom":"$['instance'][*]['collectedfrom']['key']",\
-  "publisher":"$['publisher'].value",\
-  "publicationyear":"$['dateofacceptance'].value"}
+pathMap ={"author":{"path":"$['author'][*]['fullname']"}, \
+  "title":{"path":"$['title'][*]['value']"},\
+  "orcid":{"path":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid')]['value']"} ,\
+  "orcid_pending":{"path":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']"} ,\
+  "contributor" : {"path":"$['contributor'][*]['value']"},\
+  "description" : {"path":"$['description'][*]['value']"},\
+  "subject" :{"path":"$['subject'][*]['value']"}, \
+  "fos" : {"path":"$['subject'][?(@['qualifier']['classid']=='FOS')].value"} ,\
+  "sdg" : {"path":"$['subject'][?(@['qualifier']['classid']=='SDG')].value"},\
+  "journal":{"path":"$['journal'].name"},\
+  "hostedby":{"path":"$['instance'][*]['hostedby']['key']"},\
+  "collectedfrom":{"path":"$['instance'][*]['collectedfrom']['key']"},\
+  "publisher":{"path":"$['publisher'].value"},\
+  "publicationyear":{"path":"$['dateofacceptance'].value", "action":{"class":"eu.dnetlib.dhp.bulktag.actions.ExecSubstringAction",\
+  "method":"execSubstring",\
+  "params":[{"param_name":"From","param_value":0},\
+  {"param_name":"To","param_value":4}]}}}
 blacklist=empty
 allowedpids=orcid;orcid_pending
 baseURL = https://services.openaire.eu/openaire/community/
@@ -33,25 +33,25 @@ public class BulkTagJobTest {
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
 	public static final String pathMap = "{\"author\":{\"path\":\"$['author'][*]['fullname']\"}," +
-		" \"title\":{\"path\":\"$['title'][*]['value']\"}, "+
+		" \"title\":{\"path\":\"$['title'][*]['value']\"}, " +
 		" \"orcid\":{\"path\":\"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid')]['value']\"} , " +
-		" \"orcid_pending\":{\"path\":\"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']\"} ,"+
-		"\"contributor\" : {\"path\":\"$['contributor'][*]['value']\"},"+
-		" \"description\" : {\"path\":\"$['description'][*]['value']\"},"+
+		" \"orcid_pending\":{\"path\":\"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']\"} ,"
+		+
+		"\"contributor\" : {\"path\":\"$['contributor'][*]['value']\"}," +
+		" \"description\" : {\"path\":\"$['description'][*]['value']\"}," +
 		" \"subject\" :{\"path\":\"$['subject'][*]['value']\"}, " +
-		" \"fos\" : {\"path\":\"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"} , "+
+		" \"fos\" : {\"path\":\"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"} , " +
 		"\"sdg\" : {\"path\":\"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"}," +
 		"\"journal\":{\"path\":\"$['journal'].name\"}," +
 		"\"hostedby\":{\"path\":\"$['instance'][*]['hostedby']['key']\"}," +
 		"\"collectedfrom\":{\"path\":\"$['instance'][*]['collectedfrom']['key']\"}," +
 		"\"publisher\":{\"path\":\"$['publisher'].value\"}," +
 		"\"publicationyear\":{\"path\":\"$['dateofacceptance'].value\", " +
 		" \"action\":{\"clazz\":\"eu.dnetlib.dhp.bulktag.actions.ExecSubstringAction\"," +
-		"\"method\":\"execSubstring\","+
+		"\"method\":\"execSubstring\"," +
 		"\"params\":[" +
 		"{\"paramName\":\"From\", \"paramValue\":0}, " +
 		"{\"paramName\":\"To\",\"paramValue\":4}]}}}";
 
 	private static SparkSession spark;
 
@@ -325,7 +325,7 @@ public class BulkTagJobTest {
 					"-taggingConf", taggingConf,
 
 					"-outputPath", workingDir.toString() + "/",
 					"-baseURL", "https://services.openaire.eu/openaire/community/",
 					"-pathMap", pathMap
 				});
 
@@ -388,38 +388,38 @@ public class BulkTagJobTest {
 	@Test
 	void datasourceTag() throws Exception {
 		final String sourcePath = getClass()
 			.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/")
 			.getPath();
 		SparkBulkTagJob
 			.main(
 				new String[] {
 
 					"-isSparkSessionManaged", Boolean.FALSE.toString(),
 					"-sourcePath", sourcePath,
 					"-taggingConf", taggingConf,
 
 					"-outputPath", workingDir.toString() + "/",
 					"-baseURL", "https://services.openaire.eu/openaire/community/",
 					"-pathMap", pathMap
 				});
 
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
 		JavaRDD<Datasource> tmp = sc
 			.textFile(workingDir.toString() + "/datasource")
 			.map(item -> OBJECT_MAPPER.readValue(item, Datasource.class));
 
 		Assertions.assertEquals(3, tmp.count());
 		org.apache.spark.sql.Dataset<Datasource> verificationDataset = spark
 			.createDataset(tmp.rdd(), Encoders.bean(Datasource.class));
 
 		verificationDataset.createOrReplaceTempView("datasource");
 
 		String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
 			+ "from datasource "
 			+ "lateral view explode(context) c as MyT "
 			+ "lateral view explode(MyT.datainfo) d as MyD "
 			+ "where MyD.inferenceprovenance = 'bulktagging'";
 
 		org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
 
@@ -427,19 +427,18 @@ public class BulkTagJobTest {
 
 		Assertions.assertEquals(3, idExplodeCommunity.count());
 		Assertions
 			.assertEquals(
 				3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
 		Assertions
 			.assertEquals(
 				3,
 				idExplodeCommunity
 					.filter("name = 'Bulktagging for Community - Datasource'")
 					.count());
 
 		Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'dh-ch'").count());
 		Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'clarin'").count());
 
-
 	}
 
 	@Test
@@ -1668,60 +1667,94 @@ public class BulkTagJobTest {
 		Assertions.assertEquals(0, spark.sql(query).count());
 	}
 
 	@Test
 	void pubdateTest() throws Exception {
 
 		final String pathMap = BulkTagJobTest.pathMap;
 		SparkBulkTagJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged", Boolean.FALSE.toString(),
 					"-sourcePath",
 					getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/publicationyear/").getPath(),
 					"-taggingConf",
 					IOUtils
 						.toString(
 							BulkTagJobTest.class
 								.getResourceAsStream(
 									"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_publicationdate.xml")),
 					"-outputPath", workingDir.toString() + "/",
 					"-pathMap", pathMap
 				});
 
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 
 		JavaRDD<Dataset> tmp = sc
 			.textFile(workingDir.toString() + "/dataset")
 			.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
 
 		Assertions.assertEquals(10, tmp.count());
 		org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
 			.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
 
 		verificationDataset.createOrReplaceTempView("dataset");
 
 		String query = "select id, MyT.id community, MyD.provenanceaction.classid "
 			+ "from dataset "
 			+ "lateral view explode(context) c as MyT "
 			+ "lateral view explode(MyT.datainfo) d as MyD "
 			+ "where MyD.inferenceprovenance = 'bulktagging'";
 
 		org.apache.spark.sql.Dataset<Row> queryResult = spark.sql(query);
 		queryResult.show(false);
 		Assertions.assertEquals(5, queryResult.count());
 
-		Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529")).count());
-		Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::2f4f3c820c450bd08dac08d07cc82dcf")).count());
-		Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::7fcbe3a03280663cddebfd3cb9203177")).count());
-		Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::d791339867bec6d3eb2104deeb4e4961")).count());
-		Assertions.assertEquals(1, queryResult.filter((FilterFunction<Row>) r -> r.getAs("id").equals("50|od______3989::d90d3a1f64ad264b5ebed8a35b280343")).count());
+		Assertions
+			.assertEquals(
+				1,
+				queryResult
+					.filter(
+						(FilterFunction<Row>) r -> r
+							.getAs("id")
+							.equals("50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529"))
+					.count());
+		Assertions
+			.assertEquals(
+				1,
+				queryResult
+					.filter(
+						(FilterFunction<Row>) r -> r
+							.getAs("id")
+							.equals("50|od______3989::2f4f3c820c450bd08dac08d07cc82dcf"))
+					.count());
+		Assertions
+			.assertEquals(
+				1,
+				queryResult
+					.filter(
+						(FilterFunction<Row>) r -> r
+							.getAs("id")
+							.equals("50|od______3989::7fcbe3a03280663cddebfd3cb9203177"))
+					.count());
+		Assertions
+			.assertEquals(
+				1,
+				queryResult
+					.filter(
						(FilterFunction<Row>) r -> r
+							.getAs("id")
+							.equals("50|od______3989::d791339867bec6d3eb2104deeb4e4961"))
+					.count());
+		Assertions
+			.assertEquals(
+				1,
+				queryResult
+					.filter(
+						(FilterFunction<Row>) r -> r
+							.getAs("id")
+							.equals("50|od______3989::d90d3a1f64ad264b5ebed8a35b280343"))
+					.count());
 
 	}
 
 
 }