forked from D-Net/dnet-hadoop
[BulkTagging] Modified the code to apply constraints horizontally across all the results. Added subject to the set of fields used to express the constraints. Modified resources to test the new approach. Modified test class.
This commit is contained in:
parent
960cb861a0
commit
55da4d8715
|
@ -6,6 +6,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
|
||||
/** Created by miriam on 01/08/2018. */
|
||||
public class Community implements Serializable {
|
||||
|
@ -14,6 +15,7 @@ public class Community implements Serializable {
|
|||
private List<String> subjects = new ArrayList<>();
|
||||
private List<Provider> providers = new ArrayList<>();
|
||||
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
|
||||
private SelectionConstraints constraints = new SelectionConstraints();
|
||||
|
||||
public String toJson() {
|
||||
final Gson g = new Gson();
|
||||
|
@ -57,4 +59,12 @@ public class Community implements Serializable {
|
|||
/**
 * Replaces the list of Zenodo communities associated with this community.
 *
 * @param zenodoCommunities the new list; stored as-is (no defensive copy)
 */
public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) {
|
||||
this.zenodoCommunities = zenodoCommunities;
|
||||
}
|
||||
|
||||
/**
 * Returns the selection constraints configured for this community.
 *
 * @return the constraints currently stored; may be {@code null} if {@code null} was passed to
 *         {@link #setConstraints(SelectionConstraints)}
 */
public SelectionConstraints getConstraints() {
|
||||
return constraints;
|
||||
}
|
||||
|
||||
/**
 * Replaces the selection constraints configured for this community.
 *
 * @param constraints the new constraints; stored as-is, no null check is performed
 */
public void setConstraints(SelectionConstraints constraints) {
|
||||
this.constraints = constraints;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,8 @@ public class CommunityConfiguration implements Serializable {
|
|||
private Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap = new HashMap<>();
|
||||
// map zenodocommunityid -> communityid
|
||||
private Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap = new HashMap<>();
|
||||
// map communityid -> selectionconstraints
|
||||
private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>();
|
||||
|
||||
public Map<String, List<Pair<String, SelectionConstraints>>> getSubjectMap() {
|
||||
return subjectMap;
|
||||
|
@ -51,6 +53,14 @@ public class CommunityConfiguration implements Serializable {
|
|||
this.zenodocommunityMap = zenodocommunityMap;
|
||||
}
|
||||
|
||||
/**
 * Returns the map from community id to the selection constraints of that community.
 *
 * @return the internal map itself (not a copy)
 */
public Map<String, SelectionConstraints> getSelectionConstraintsMap() {
|
||||
return selectionConstraintsMap;
|
||||
}
|
||||
|
||||
/**
 * Replaces the map from community id to selection constraints.
 *
 * @param selectionConstraintsMap the new map; stored as-is (no defensive copy)
 */
public void setSelectionConstraintsMap(Map<String, SelectionConstraints> selectionConstraintsMap) {
|
||||
this.selectionConstraintsMap = selectionConstraintsMap;
|
||||
}
|
||||
|
||||
CommunityConfiguration(final Map<String, Community> communities) {
|
||||
this.communities = communities;
|
||||
init();
|
||||
|
@ -67,6 +77,9 @@ public class CommunityConfiguration implements Serializable {
|
|||
if (zenodocommunityMap == null) {
|
||||
zenodocommunityMap = Maps.newHashMap();
|
||||
}
|
||||
if(selectionConstraintsMap == null){
|
||||
selectionConstraintsMap = Maps.newHashMap();
|
||||
}
|
||||
|
||||
for (Community c : getCommunities().values()) {
|
||||
// get subjects
|
||||
|
@ -87,6 +100,7 @@ public class CommunityConfiguration implements Serializable {
|
|||
new Pair<>(id, zc.getSelCriteria()),
|
||||
zenodocommunityMap);
|
||||
}
|
||||
selectionConstraintsMap.put(id, c.getConstraints());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -85,9 +85,21 @@ public class CommunityConfigurationFactory {
|
|||
c.setSubjects(parseSubjects(node));
|
||||
c.setProviders(parseDatasources(node));
|
||||
c.setZenodoCommunities(parseZenodoCommunities(node));
|
||||
c.setConstraints(parseConstrains(node));
|
||||
return c;
|
||||
}
|
||||
|
||||
private static SelectionConstraints parseConstrains(Node node) {
|
||||
Node aconstraints = node.selectSingleNode("./advancedConstraints");
|
||||
if(aconstraints == null){
|
||||
return null;
|
||||
}
|
||||
SelectionConstraints selectionConstraints = new Gson().fromJson(aconstraints.getText(), SelectionConstraints.class);
|
||||
|
||||
selectionConstraints.setSelection(resolver);
|
||||
return selectionConstraints;
|
||||
}
|
||||
|
||||
private static List<String> parseSubjects(final Node node) {
|
||||
|
||||
final List<String> subjects = Lists.newArrayList();
|
||||
|
|
|
@ -11,6 +11,7 @@ public class Constraint implements Serializable {
|
|||
private String verb;
|
||||
private String field;
|
||||
private String value;
|
||||
// private String element;
|
||||
private Selection selection;
|
||||
|
||||
public String getVerb() {
|
||||
|
@ -50,4 +51,12 @@ public class Constraint implements Serializable {
|
|||
/**
 * Evaluates this constraint against a metadata value by delegating to {@code selection}.
 *
 * @param metadata the value to test
 * @return the result of {@code selection.apply(metadata)}
 */
public boolean verifyCriteria(String metadata) {
|
||||
// NOTE(review): no null check on selection here — confirm it is always assigned before use
return selection.apply(metadata);
|
||||
}
|
||||
|
||||
// public String getElement() {
|
||||
// return element;
|
||||
// }
|
||||
//
|
||||
// public void setElement(String element) {
|
||||
// this.element = element;
|
||||
// }
|
||||
}
|
||||
|
|
|
@ -9,6 +9,9 @@ import java.util.*;
|
|||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
@ -95,13 +98,6 @@ public class ResultTagger implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
// result
|
||||
// .getInstance()
|
||||
// .stream()
|
||||
// .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
|
||||
// .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
|
||||
// .map(s -> StringUtils.substringAfter(s, "|"))
|
||||
// .collect(Collectors.toCollection(HashSet::new))
|
||||
tmp
|
||||
.forEach(
|
||||
dsId -> datasources
|
||||
|
@ -135,6 +131,19 @@ public class ResultTagger implements Serializable {
|
|||
|
||||
communities.addAll(czenodo);
|
||||
|
||||
/* Tagging for Advanced Constraints */
|
||||
final Set<String> aconstraints = new HashSet<>();
|
||||
|
||||
conf.getSelectionConstraintsMap().keySet()
|
||||
.forEach(communityId -> {
|
||||
if(conf.getSelectionConstraintsMap().get(communityId) != null &&
|
||||
conf.getSelectionConstraintsMap().get(communityId)
|
||||
.getCriteria().stream().anyMatch(crit -> crit.verifyCriteria(param)))
|
||||
aconstraints.add(communityId);
|
||||
});
|
||||
|
||||
communities.addAll(aconstraints);
|
||||
|
||||
clearContext(result);
|
||||
|
||||
/* Verify if there is something to bulktag */
|
||||
|
@ -152,30 +161,24 @@ public class ResultTagger implements Serializable {
|
|||
dataInfoList = new ArrayList<>();
|
||||
c.setDataInfo(dataInfoList);
|
||||
}
|
||||
if (subjects.contains(c.getId()))
|
||||
if (subjects.contains(c))
|
||||
dataInfoList
|
||||
.add(
|
||||
getDataInfo(
|
||||
BULKTAG_DATA_INFO_TYPE,
|
||||
CLASS_ID_SUBJECT,
|
||||
CLASS_NAME_BULKTAG_SUBJECT,
|
||||
TAGGING_TRUST));
|
||||
if (datasources.contains(c.getId()))
|
||||
.add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
|
||||
OafMapperUtils.qualifier(CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
|
||||
if (datasources.contains(c))
|
||||
dataInfoList
|
||||
.add(
|
||||
getDataInfo(
|
||||
BULKTAG_DATA_INFO_TYPE,
|
||||
CLASS_ID_DATASOURCE,
|
||||
CLASS_NAME_BULKTAG_DATASOURCE,
|
||||
TAGGING_TRUST));
|
||||
if (czenodo.contains(c.getId()))
|
||||
.add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
|
||||
OafMapperUtils.qualifier(CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
|
||||
if (czenodo.contains(c))
|
||||
dataInfoList
|
||||
.add(
|
||||
getDataInfo(
|
||||
BULKTAG_DATA_INFO_TYPE,
|
||||
CLASS_ID_CZENODO,
|
||||
CLASS_NAME_BULKTAG_ZENODO,
|
||||
TAGGING_TRUST));
|
||||
.add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
|
||||
OafMapperUtils.qualifier(CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
|
||||
if (aconstraints.contains(c))
|
||||
dataInfoList
|
||||
.add(
|
||||
OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
|
||||
OafMapperUtils.qualifier(CLASS_ID_ADVANCED_CONSTRAINT, CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -195,28 +198,22 @@ public class ResultTagger implements Serializable {
|
|||
List<DataInfo> dataInfoList = new ArrayList<>();
|
||||
if (subjects.contains(c))
|
||||
dataInfoList
|
||||
.add(
|
||||
getDataInfo(
|
||||
BULKTAG_DATA_INFO_TYPE,
|
||||
CLASS_ID_SUBJECT,
|
||||
CLASS_NAME_BULKTAG_SUBJECT,
|
||||
TAGGING_TRUST));
|
||||
.add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
|
||||
OafMapperUtils.qualifier(CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
|
||||
if (datasources.contains(c))
|
||||
dataInfoList
|
||||
.add(
|
||||
getDataInfo(
|
||||
BULKTAG_DATA_INFO_TYPE,
|
||||
CLASS_ID_DATASOURCE,
|
||||
CLASS_NAME_BULKTAG_DATASOURCE,
|
||||
TAGGING_TRUST));
|
||||
.add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
|
||||
OafMapperUtils.qualifier(CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
|
||||
if (czenodo.contains(c))
|
||||
dataInfoList
|
||||
.add(
|
||||
getDataInfo(
|
||||
BULKTAG_DATA_INFO_TYPE,
|
||||
CLASS_ID_CZENODO,
|
||||
CLASS_NAME_BULKTAG_ZENODO,
|
||||
TAGGING_TRUST));
|
||||
.add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
|
||||
OafMapperUtils.qualifier(CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
|
||||
if (aconstraints.contains(c))
|
||||
dataInfoList
|
||||
.add(
|
||||
OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
|
||||
OafMapperUtils.qualifier(CLASS_ID_ADVANCED_CONSTRAINT, CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
|
||||
|
||||
context.setDataInfo(dataInfoList);
|
||||
return context;
|
||||
})
|
||||
|
@ -226,22 +223,4 @@ public class ResultTagger implements Serializable {
|
|||
return result;
|
||||
}
|
||||
|
||||
public static DataInfo getDataInfo(
|
||||
String inference_provenance, String inference_class_id, String inference_class_name, String trust) {
|
||||
DataInfo di = new DataInfo();
|
||||
di.setInferred(true);
|
||||
di.setInferenceprovenance(inference_provenance);
|
||||
di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
|
||||
di.setTrust(trust);
|
||||
return di;
|
||||
}
|
||||
|
||||
public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
|
||||
Qualifier pa = new Qualifier();
|
||||
pa.setClassid(inference_class_id);
|
||||
pa.setClassname(inference_class_name);
|
||||
pa.setSchemeid(DNET_PROVENANCE_ACTIONS);
|
||||
pa.setSchemename(DNET_PROVENANCE_ACTIONS);
|
||||
return pa;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,12 +11,14 @@ public class TaggingConstants {
|
|||
public static final String CLASS_ID_SUBJECT = "community:subject";
|
||||
public static final String CLASS_ID_DATASOURCE = "community:datasource";
|
||||
public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
|
||||
public static final String CLASS_ID_ADVANCED_CONSTRAINT = "community:advconstraint";
|
||||
|
||||
public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
|
||||
|
||||
public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
|
||||
public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
|
||||
public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
|
||||
public static final String CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT = "Bulktagging for Community - Advanced Constraints";
|
||||
|
||||
public static final String TAGGING_TRUST = "0.8";
|
||||
}
|
||||
|
|
|
@ -6,7 +6,16 @@ import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY
|
|||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import eu.dnetlib.dhp.bulktag.community.ProtoMap;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -24,11 +33,6 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class BulkTagJobTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
@ -39,7 +43,8 @@ public class BulkTagJobTest {
|
|||
+ " \"title\" : \"$['title'][*]['value']\","
|
||||
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
|
||||
+ " \"contributor\" : \"$['contributor'][*]['value']\","
|
||||
+ " \"description\" : \"$['description'][*]['value']\"}";
|
||||
+ " \"description\" : \"$['description'][*]['value']\", "
|
||||
+" \"subject\" :\"$['subject'][*]['value']\" }";
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
|
@ -763,10 +768,28 @@ public class BulkTagJobTest {
|
|||
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
|
||||
|
||||
idExplodeCommunity.show(false);
|
||||
Assertions.assertEquals(3, idExplodeCommunity.count());
|
||||
Assertions.assertEquals(4, idExplodeCommunity.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
|
||||
}
|
||||
|
||||
// @Test
|
||||
// void test1(){
|
||||
// ProtoMap params = new Gson().fromJson(pathMap, ProtoMap.class);
|
||||
// HashMap<String, String> param = new HashMap<>();
|
||||
// for (String key : params.keySet()) {
|
||||
// try {
|
||||
// param.put(key, jsonContext.read(params.get(key)));
|
||||
// } catch (com.jayway.jsonpath.PathNotFoundException e) {
|
||||
// param.put(key, new ArrayList<>());
|
||||
// }
|
||||
// }
|
||||
// return param;
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
|
|
@ -1193,6 +1193,9 @@
|
|||
<organizations/>
|
||||
</community>
|
||||
<community id="science-innovation-policy">
|
||||
<advancedConstraints>{"criteria":[{"constraint":[{"verb":"equals_ignorecase","field":"subject","value":"ciencias de la comunicación"},
|
||||
{"verb":"equals","field":"subject","value":"Miriam"}]},
|
||||
{"constraint":[{"verb":"equals","field":"subject","value":"miriam"}]}]}</advancedConstraints>
|
||||
<subjects>
|
||||
<subject>Sustainability-oriented science policy</subject>
|
||||
<subject> STI policies</subject>
|
||||
|
@ -1316,7 +1319,7 @@
|
|||
<openaireId>opendoar____::358aee4cc897452c00244351e4d91f69</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue