dnet-hadoop/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java


package eu.dnetlib.dhp.community;

import static eu.dnetlib.dhp.community.TagginConstants.*;

import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.oaf.*;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;

/**
 * Created by miriam on 02/08/2018.
 *
 * <p>Tags a {@link Result} with the communities it belongs to, matching it through its subjects,
 * the datasources it was collected from or is hosted by, and the Zenodo communities it is part
 * of, as described by a {@link CommunityConfiguration}.
 */
public class ResultTagger implements Serializable {
2020-03-03 16:38:50 +01:00
private String trust = "0.8";
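
/**
 * Removes from the result every context whose id refers to a Zenodo community
 * (i.e. contains {@code ZENODO_COMMUNITY_INDICATOR}).
 *
 * @param result the result to clean
 * @return true if at least one context entry was removed
 */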
private boolean clearContext(Result result) {
int tmp = result.getContext().size();
List<Context> clist =
result.getContext().stream()
.filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR)))
.collect(Collectors.toList());
result.setContext(clist);
return (tmp != clist.size());
}
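
/**
 * Evaluates the configured json path expressions against the json serialization of the result.
 *
 * @param result the result to inspect
 * @param params map from parameter name to json path expression (may be null)
 * @return a map from parameter name to the values matched by the corresponding path; a
 *     parameter whose path is not found maps to an empty list
 */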
private Map<String, List<String>> getParamMap(final Result result, Map<String, String> params) {
Map<String, List<String>> param = new HashMap<>();
String json = new Gson().toJson(result, Result.class);
DocumentContext jsonContext = JsonPath.parse(json);
if (params == null) {
params = new HashMap<>();
}
for (Map.Entry<String, String> entry : params.entrySet()) {
try {
param.put(entry.getKey(), jsonContext.read(entry.getValue()));
} catch (com.jayway.jsonpath.PathNotFoundException e) {
// the path has no match in this result: associate the key with an empty list
param.put(entry.getKey(), new ArrayList<>());
}
}
return param;
}
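
/**
 * Enriches the result with the contexts of the communities it matches according to the given
 * configuration. Communities are matched through the result subjects, the datasources it was
 * collected from or is hosted by, and the Zenodo communities it belongs to; every context
 * added or updated this way carries a {@link DataInfo} recording the bulktagging provenance.
 *
 * @param result the result to enrich
 * @param conf the community configuration
 * @param criteria map from parameter name to json path, used when matching datasources
 * @return the (possibly) enriched result
 */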
public <R extends Result> R enrichContextCriteria(
final R result, final CommunityConfiguration conf, final Map<String, String> criteria) {
final Map<String, List<String>> param = getParamMap(result, criteria);
// If the entity is flagged as deletedbyinference, only clean the context list from all the
// Zenodo communities and return it as is
if (result.getDataInfo().getDeletedbyinference()) {
clearContext(result);
return result;
}
// communities contains all the communities to be added as context for the result
final Set<String> communities = new HashSet<>();
// tagging for Subject
final Set<String> subjects = new HashSet<>();
Optional<List<StructuredProperty>> oresultsubj = Optional.ofNullable(result.getSubject());
if (oresultsubj.isPresent()) {
oresultsubj.get().stream()
.map(subject -> subject.getValue())
.filter(StringUtils::isNotBlank)
.map(String::toLowerCase)
.map(String::trim)
.collect(Collectors.toCollection(HashSet::new))
.forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
}
communities.addAll(subjects);
// Tagging for datasource
final Set<String> datasources = new HashSet<>();
Optional<List<Instance>> oresultinstance = Optional.ofNullable(result.getInstance());
if (oresultinstance.isPresent()) {
oresultinstance.get().stream()
.map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
.flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
.map(s -> StringUtils.substringAfter(s, "|"))
.collect(Collectors.toCollection(HashSet::new))
.forEach(
dsId ->
datasources.addAll(
conf.getCommunityForDatasource(dsId, param)));
}
communities.addAll(datasources);
// Tagging for Zenodo communities
final Set<String> czenodo = new HashSet<>();
Optional<List<Context>> oresultcontext = Optional.ofNullable(result.getContext());
if (oresultcontext.isPresent()) {
oresultcontext.get().stream()
.filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
.forEach(
c ->
czenodo.addAll(
conf.getCommunityForZenodoCommunityValue(
c.getId()
.substring(c.getId().lastIndexOf("/") + 1)
.trim())));
}
communities.addAll(czenodo);
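
// Remove the Zenodo community contexts from the result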
clearContext(result);
// Verify if there is something to bulktag
if (communities.isEmpty()) {
return result;
}
// Add bulktagging provenance to the contexts already present in the result
result.getContext()
.forEach(
c -> {
if (communities.contains(c.getId())) {
Optional<List<DataInfo>> optDataInfoList =
Optional.ofNullable(c.getDataInfo());
List<DataInfo> dataInfoList;
if (optDataInfoList.isPresent()) dataInfoList = optDataInfoList.get();
else {
dataInfoList = new ArrayList<>();
c.setDataInfo(dataInfoList);
}
if (subjects.contains(c.getId()))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_SUBJECT,
CLASS_NAME_BULKTAG_SUBJECT));
if (datasources.contains(c.getId()))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_DATASOURCE,
CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c.getId()))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_CZENODO,
CLASS_NAME_BULKTAG_ZENODO));
}
});
// Communities already present on the result have been handled above: keep only the new ones
communities.removeAll(
result.getContext().stream().map(Context::getId).collect(Collectors.toSet()));
if (communities.isEmpty()) return result;
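
// Build a new context entry for every matched community not yet present in the result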
List<Context> toaddcontext =
communities.stream()
.map(
c -> {
Context context = new Context();
context.setId(c);
List<DataInfo> dataInfoList = new ArrayList<>();
if (subjects.contains(c))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_SUBJECT,
CLASS_NAME_BULKTAG_SUBJECT));
if (datasources.contains(c))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_DATASOURCE,
CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_CZENODO,
CLASS_NAME_BULKTAG_ZENODO));
context.setDataInfo(dataInfoList);
return context;
})
.collect(Collectors.toList());
result.getContext().addAll(toaddcontext);
return result;
}
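
/**
 * Builds a {@link DataInfo} marking the community association as inferred by the bulktagging
 * process, with the given inference provenance and provenance action class.
 */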
public static DataInfo getDataInfo(
String inferenceProvenance, String inferenceClassId, String inferenceClassName) {
DataInfo di = new DataInfo();
di.setInferred(true);
di.setInferenceprovenance(inferenceProvenance);
di.setProvenanceaction(getQualifier(inferenceClassId, inferenceClassName));
return di;
}
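
/** Builds the provenance action {@link Qualifier} for the given inference class id and name. */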
public static Qualifier getQualifier(String inferenceClassId, String inferenceClassName) {
Qualifier pa = new Qualifier();
pa.setClassid(inferenceClassId);
pa.setClassname(inferenceClassName);
pa.setSchemeid(DNET_SCHEMA_ID);
pa.setSchemename(DNET_SCHEMA_NAME);
return pa;
}
}