[BulkTagging] modifying code to represent constraints horizontally on all the results. Added subject to the set of field used to express the constraint. Modified resorces to test the new approach. Modified test calss

This commit is contained in:
Miriam Baglioni 2022-09-23 16:02:19 +02:00
parent 960cb861a0
commit 55da4d8715
10 changed files with 134 additions and 72 deletions

View File

@ -6,6 +6,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.google.gson.Gson; import com.google.gson.Gson;
import org.apache.avro.generic.GenericData;
/** Created by miriam on 01/08/2018. */ /** Created by miriam on 01/08/2018. */
public class Community implements Serializable { public class Community implements Serializable {
@ -14,6 +15,7 @@ public class Community implements Serializable {
private List<String> subjects = new ArrayList<>(); private List<String> subjects = new ArrayList<>();
private List<Provider> providers = new ArrayList<>(); private List<Provider> providers = new ArrayList<>();
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>(); private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
private SelectionConstraints constraints = new SelectionConstraints();
public String toJson() { public String toJson() {
final Gson g = new Gson(); final Gson g = new Gson();
@ -57,4 +59,12 @@ public class Community implements Serializable {
public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) { public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) {
this.zenodoCommunities = zenodoCommunities; this.zenodoCommunities = zenodoCommunities;
} }
public SelectionConstraints getConstraints() {
return constraints;
}
public void setConstraints(SelectionConstraints constraints) {
this.constraints = constraints;
}
} }

View File

@ -24,6 +24,8 @@ public class CommunityConfiguration implements Serializable {
private Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap = new HashMap<>(); private Map<String, List<Pair<String, SelectionConstraints>>> datasourceMap = new HashMap<>();
// map zenodocommunityid -> communityid // map zenodocommunityid -> communityid
private Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap = new HashMap<>(); private Map<String, List<Pair<String, SelectionConstraints>>> zenodocommunityMap = new HashMap<>();
// map communityid -> selectionconstraints
private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>();
public Map<String, List<Pair<String, SelectionConstraints>>> getSubjectMap() { public Map<String, List<Pair<String, SelectionConstraints>>> getSubjectMap() {
return subjectMap; return subjectMap;
@ -51,6 +53,14 @@ public class CommunityConfiguration implements Serializable {
this.zenodocommunityMap = zenodocommunityMap; this.zenodocommunityMap = zenodocommunityMap;
} }
public Map<String, SelectionConstraints> getSelectionConstraintsMap() {
return selectionConstraintsMap;
}
public void setSelectionConstraintsMap(Map<String, SelectionConstraints> selectionConstraintsMap) {
this.selectionConstraintsMap = selectionConstraintsMap;
}
CommunityConfiguration(final Map<String, Community> communities) { CommunityConfiguration(final Map<String, Community> communities) {
this.communities = communities; this.communities = communities;
init(); init();
@ -67,6 +77,9 @@ public class CommunityConfiguration implements Serializable {
if (zenodocommunityMap == null) { if (zenodocommunityMap == null) {
zenodocommunityMap = Maps.newHashMap(); zenodocommunityMap = Maps.newHashMap();
} }
if(selectionConstraintsMap == null){
selectionConstraintsMap = Maps.newHashMap();
}
for (Community c : getCommunities().values()) { for (Community c : getCommunities().values()) {
// get subjects // get subjects
@ -87,6 +100,7 @@ public class CommunityConfiguration implements Serializable {
new Pair<>(id, zc.getSelCriteria()), new Pair<>(id, zc.getSelCriteria()),
zenodocommunityMap); zenodocommunityMap);
} }
selectionConstraintsMap.put(id, c.getConstraints());
} }
} }

View File

@ -85,9 +85,21 @@ public class CommunityConfigurationFactory {
c.setSubjects(parseSubjects(node)); c.setSubjects(parseSubjects(node));
c.setProviders(parseDatasources(node)); c.setProviders(parseDatasources(node));
c.setZenodoCommunities(parseZenodoCommunities(node)); c.setZenodoCommunities(parseZenodoCommunities(node));
c.setConstraints(parseConstrains(node));
return c; return c;
} }
private static SelectionConstraints parseConstrains(Node node) {
Node aconstraints = node.selectSingleNode("./advancedConstraints");
if(aconstraints == null){
return null;
}
SelectionConstraints selectionConstraints = new Gson().fromJson(aconstraints.getText(), SelectionConstraints.class);
selectionConstraints.setSelection(resolver);
return selectionConstraints;
}
private static List<String> parseSubjects(final Node node) { private static List<String> parseSubjects(final Node node) {
final List<String> subjects = Lists.newArrayList(); final List<String> subjects = Lists.newArrayList();

View File

@ -11,6 +11,7 @@ public class Constraint implements Serializable {
private String verb; private String verb;
private String field; private String field;
private String value; private String value;
// private String element;
private Selection selection; private Selection selection;
public String getVerb() { public String getVerb() {
@ -50,4 +51,12 @@ public class Constraint implements Serializable {
public boolean verifyCriteria(String metadata) { public boolean verifyCriteria(String metadata) {
return selection.apply(metadata); return selection.apply(metadata);
} }
// public String getElement() {
// return element;
// }
//
// public void setElement(String element) {
// this.element = element;
// }
} }

View File

@ -9,6 +9,9 @@ import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.google.gson.Gson; import com.google.gson.Gson;
@ -95,13 +98,6 @@ public class ResultTagger implements Serializable {
} }
// result
// .getInstance()
// .stream()
// .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
// .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
// .map(s -> StringUtils.substringAfter(s, "|"))
// .collect(Collectors.toCollection(HashSet::new))
tmp tmp
.forEach( .forEach(
dsId -> datasources dsId -> datasources
@ -135,6 +131,19 @@ public class ResultTagger implements Serializable {
communities.addAll(czenodo); communities.addAll(czenodo);
/* Tagging for Advanced Constraints */
final Set<String> aconstraints = new HashSet<>();
conf.getSelectionConstraintsMap().keySet()
.forEach(communityId -> {
if(conf.getSelectionConstraintsMap().get(communityId) != null &&
conf.getSelectionConstraintsMap().get(communityId)
.getCriteria().stream().anyMatch(crit -> crit.verifyCriteria(param)))
aconstraints.add(communityId);
});
communities.addAll(aconstraints);
clearContext(result); clearContext(result);
/* Verify if there is something to bulktag */ /* Verify if there is something to bulktag */
@ -152,30 +161,24 @@ public class ResultTagger implements Serializable {
dataInfoList = new ArrayList<>(); dataInfoList = new ArrayList<>();
c.setDataInfo(dataInfoList); c.setDataInfo(dataInfoList);
} }
if (subjects.contains(c.getId())) if (subjects.contains(c))
dataInfoList dataInfoList
.add( .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
getDataInfo( OafMapperUtils.qualifier(CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
BULKTAG_DATA_INFO_TYPE, if (datasources.contains(c))
CLASS_ID_SUBJECT,
CLASS_NAME_BULKTAG_SUBJECT,
TAGGING_TRUST));
if (datasources.contains(c.getId()))
dataInfoList dataInfoList
.add( .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
getDataInfo( OafMapperUtils.qualifier(CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
BULKTAG_DATA_INFO_TYPE, if (czenodo.contains(c))
CLASS_ID_DATASOURCE,
CLASS_NAME_BULKTAG_DATASOURCE,
TAGGING_TRUST));
if (czenodo.contains(c.getId()))
dataInfoList dataInfoList
.add( .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
getDataInfo( OafMapperUtils.qualifier(CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
BULKTAG_DATA_INFO_TYPE, if (aconstraints.contains(c))
CLASS_ID_CZENODO, dataInfoList
CLASS_NAME_BULKTAG_ZENODO, .add(
TAGGING_TRUST)); OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
OafMapperUtils.qualifier(CLASS_ID_ADVANCED_CONSTRAINT, CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
} }
}); });
@ -195,28 +198,22 @@ public class ResultTagger implements Serializable {
List<DataInfo> dataInfoList = new ArrayList<>(); List<DataInfo> dataInfoList = new ArrayList<>();
if (subjects.contains(c)) if (subjects.contains(c))
dataInfoList dataInfoList
.add( .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
getDataInfo( OafMapperUtils.qualifier(CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_SUBJECT,
CLASS_NAME_BULKTAG_SUBJECT,
TAGGING_TRUST));
if (datasources.contains(c)) if (datasources.contains(c))
dataInfoList dataInfoList
.add( .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
getDataInfo( OafMapperUtils.qualifier(CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_DATASOURCE,
CLASS_NAME_BULKTAG_DATASOURCE,
TAGGING_TRUST));
if (czenodo.contains(c)) if (czenodo.contains(c))
dataInfoList dataInfoList
.add( .add(OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
getDataInfo( OafMapperUtils.qualifier(CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
BULKTAG_DATA_INFO_TYPE, if (aconstraints.contains(c))
CLASS_ID_CZENODO, dataInfoList
CLASS_NAME_BULKTAG_ZENODO, .add(
TAGGING_TRUST)); OafMapperUtils.dataInfo(false, BULKTAG_DATA_INFO_TYPE, true, false,
OafMapperUtils.qualifier(CLASS_ID_ADVANCED_CONSTRAINT, CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST));
context.setDataInfo(dataInfoList); context.setDataInfo(dataInfoList);
return context; return context;
}) })
@ -226,22 +223,4 @@ public class ResultTagger implements Serializable {
return result; return result;
} }
public static DataInfo getDataInfo(
String inference_provenance, String inference_class_id, String inference_class_name, String trust) {
DataInfo di = new DataInfo();
di.setInferred(true);
di.setInferenceprovenance(inference_provenance);
di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
di.setTrust(trust);
return di;
}
public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
Qualifier pa = new Qualifier();
pa.setClassid(inference_class_id);
pa.setClassname(inference_class_name);
pa.setSchemeid(DNET_PROVENANCE_ACTIONS);
pa.setSchemename(DNET_PROVENANCE_ACTIONS);
return pa;
}
} }

View File

@ -11,12 +11,14 @@ public class TaggingConstants {
public static final String CLASS_ID_SUBJECT = "community:subject"; public static final String CLASS_ID_SUBJECT = "community:subject";
public static final String CLASS_ID_DATASOURCE = "community:datasource"; public static final String CLASS_ID_DATASOURCE = "community:datasource";
public static final String CLASS_ID_CZENODO = "community:zenodocommunity"; public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
public static final String CLASS_ID_ADVANCED_CONSTRAINT = "community:advconstraint";
public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/"; public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject"; public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource"; public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo"; public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
public static final String CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT = "Bulktagging for Community - Advanced Constraints";
public static final String TAGGING_TRUST = "0.8"; public static final String TAGGING_TRUST = "0.8";
} }

View File

@ -6,7 +6,16 @@ import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.bulktag.community.ProtoMap;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -24,11 +33,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Software;
public class BulkTagJobTest { public class BulkTagJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@ -39,7 +43,8 @@ public class BulkTagJobTest {
+ " \"title\" : \"$['title'][*]['value']\"," + " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\"," + " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"; + " \"description\" : \"$['description'][*]['value']\", "
+" \"subject\" :\"$['subject'][*]['value']\" }";
private static SparkSession spark; private static SparkSession spark;
@ -763,10 +768,28 @@ public class BulkTagJobTest {
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query); org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
idExplodeCommunity.show(false); idExplodeCommunity.show(false);
Assertions.assertEquals(3, idExplodeCommunity.count()); Assertions.assertEquals(4, idExplodeCommunity.count());
Assertions Assertions
.assertEquals( .assertEquals(
3, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
Assertions
.assertEquals(
1, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
} }
// @Test
// void test1(){
// ProtoMap params = new Gson().fromJson(pathMap, ProtoMap.class);
// HashMap<String, String> param = new HashMap<>();
// for (String key : params.keySet()) {
// try {
// param.put(key, jsonContext.read(params.get(key)));
// } catch (com.jayway.jsonpath.PathNotFoundException e) {
// param.put(key, new ArrayList<>());
// }
// }
// return param;
// }
// }
} }

View File

@ -1193,6 +1193,9 @@
<organizations/> <organizations/>
</community> </community>
<community id="science-innovation-policy"> <community id="science-innovation-policy">
<advancedConstraints>{"criteria":[{"constraint":[{"verb":"equals_ignorecase","field":"subject","value":"ciencias de la comunicación"},
{"verb":"equals","field":"subject","value":"Miriam"}]},
{"constraint":[{"verb":"equals","field":"subject","value":"miriam"}]}]}</advancedConstraints>
<subjects> <subjects>
<subject>Sustainability-oriented science policy</subject> <subject>Sustainability-oriented science policy</subject>
<subject> STI policies</subject> <subject> STI policies</subject>
@ -1316,7 +1319,7 @@
<openaireId>opendoar____::358aee4cc897452c00244351e4d91f69</openaireId> <openaireId>opendoar____::358aee4cc897452c00244351e4d91f69</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, <selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}}]}
</selcriteria> </selcriteria>
</datasource> </datasource>
<datasource> <datasource>