forked from D-Net/dnet-hadoop
[BulkTag] Adding remove constraints to specify when a community must not appear in the context of a result.
This commit is contained in:
parent
a235d2a24a
commit
34172455d1
|
@ -15,6 +15,7 @@ public class Community implements Serializable {
|
||||||
private List<Provider> providers = new ArrayList<>();
|
private List<Provider> providers = new ArrayList<>();
|
||||||
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
|
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
|
||||||
private SelectionConstraints constraints = new SelectionConstraints();
|
private SelectionConstraints constraints = new SelectionConstraints();
|
||||||
|
private SelectionConstraints removeConstraints = new SelectionConstraints();
|
||||||
|
|
||||||
public String toJson() {
|
public String toJson() {
|
||||||
final Gson g = new Gson();
|
final Gson g = new Gson();
|
||||||
|
@ -67,4 +68,12 @@ public class Community implements Serializable {
|
||||||
/**
 * Replaces the selection constraints held by this community.
 *
 * @param constraints the new selection constraints
 */
public void setConstraints(SelectionConstraints constraints) {
|
public void setConstraints(SelectionConstraints constraints) {
|
||||||
this.constraints = constraints;
|
this.constraints = constraints;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the remove constraints of this community, i.e. the constraints describing when the
 * community must not appear in the context of a result.
 *
 * @return the remove constraints currently held by this community
 */
public SelectionConstraints getRemoveConstraints() {
|
||||||
|
return removeConstraints;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Replaces the remove constraints of this community, i.e. the constraints describing when the
 * community must not appear in the context of a result.
 *
 * @param removeConstraints the new remove constraints
 */
public void setRemoveConstraints(SelectionConstraints removeConstraints) {
|
||||||
|
this.removeConstraints = removeConstraints;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,8 @@ public class CommunityConfiguration implements Serializable {
|
||||||
private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>();
|
private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>();
|
||||||
// map eosc datasource -> communityid
|
// map eosc datasource -> communityid
|
||||||
private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>();
|
private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>();
|
||||||
|
//map communityid -> remove constraints
|
||||||
|
private Map<String, SelectionConstraints> removeConstraintsMap = new HashMap<>();
|
||||||
|
|
||||||
public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() {
|
public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() {
|
||||||
return eoscDatasourceMap;
|
return eoscDatasourceMap;
|
||||||
|
@ -71,6 +73,14 @@ public class CommunityConfiguration implements Serializable {
|
||||||
this.selectionConstraintsMap = selectionConstraintsMap;
|
this.selectionConstraintsMap = selectionConstraintsMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the map associating each community id with its remove constraints.
 *
 * @return the community id -> remove constraints map
 */
public Map<String, SelectionConstraints> getRemoveConstraintsMap() {
|
||||||
|
return removeConstraintsMap;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Replaces the map associating each community id with its remove constraints.
 *
 * @param removeConstraintsMap the new community id -> remove constraints map
 */
public void setRemoveConstraintsMap(Map<String, SelectionConstraints> removeConstraintsMap) {
|
||||||
|
this.removeConstraintsMap = removeConstraintsMap;
|
||||||
|
}
|
||||||
|
|
||||||
CommunityConfiguration(final Map<String, Community> communities) {
|
CommunityConfiguration(final Map<String, Community> communities) {
|
||||||
this.communities = communities;
|
this.communities = communities;
|
||||||
init();
|
init();
|
||||||
|
@ -90,6 +100,9 @@ public class CommunityConfiguration implements Serializable {
|
||||||
if (selectionConstraintsMap == null) {
|
if (selectionConstraintsMap == null) {
|
||||||
selectionConstraintsMap = Maps.newHashMap();
|
selectionConstraintsMap = Maps.newHashMap();
|
||||||
}
|
}
|
||||||
|
if (removeConstraintsMap == null) {
|
||||||
|
removeConstraintsMap = Maps.newHashMap();
|
||||||
|
}
|
||||||
|
|
||||||
for (Community c : getCommunities().values()) {
|
for (Community c : getCommunities().values()) {
|
||||||
// get subjects
|
// get subjects
|
||||||
|
@ -111,6 +124,8 @@ public class CommunityConfiguration implements Serializable {
|
||||||
zenodocommunityMap);
|
zenodocommunityMap);
|
||||||
}
|
}
|
||||||
selectionConstraintsMap.put(id, c.getConstraints());
|
selectionConstraintsMap.put(id, c.getConstraints());
|
||||||
|
|
||||||
|
removeConstraintsMap.put(id, c.getRemoveConstraints());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -86,6 +86,7 @@ public class CommunityConfigurationFactory {
|
||||||
c.setProviders(parseDatasources(node));
|
c.setProviders(parseDatasources(node));
|
||||||
c.setZenodoCommunities(parseZenodoCommunities(node));
|
c.setZenodoCommunities(parseZenodoCommunities(node));
|
||||||
c.setConstraints(parseConstrains(node));
|
c.setConstraints(parseConstrains(node));
|
||||||
|
c.setRemoveConstraints(parseRemoveConstrains(node));
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,6 +103,19 @@ public class CommunityConfigurationFactory {
|
||||||
return selectionConstraints;
|
return selectionConstraints;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static SelectionConstraints parseRemoveConstrains(Node node) {
|
||||||
|
Node constsNode = node.selectSingleNode("./removeConstraints");
|
||||||
|
if (constsNode == null || StringUtils.isBlank(StringUtils.trim(constsNode.getText()))) {
|
||||||
|
return new SelectionConstraints();
|
||||||
|
}
|
||||||
|
SelectionConstraints selectionConstraints = new Gson()
|
||||||
|
.fromJson(constsNode.getText(), SelectionConstraints.class);
|
||||||
|
|
||||||
|
selectionConstraints.setSelection(resolver);
|
||||||
|
log.info("number of selection constraints set " + selectionConstraints.getCriteria().size());
|
||||||
|
return selectionConstraints;
|
||||||
|
}
|
||||||
|
|
||||||
private static List<String> parseSubjects(final Node node) {
|
private static List<String> parseSubjects(final Node node) {
|
||||||
|
|
||||||
final List<String> subjects = Lists.newArrayList();
|
final List<String> subjects = Lists.newArrayList();
|
||||||
|
|
|
@ -79,6 +79,23 @@ public class ResultTagger implements Serializable {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// removeCommunities contains all the communities that must not be added to the context
|
||||||
|
final Set<String> removeCommunities = new HashSet<>();
|
||||||
|
|
||||||
|
conf
|
||||||
|
.getRemoveConstraintsMap()
|
||||||
|
.keySet()
|
||||||
|
.forEach(communityId -> {
|
||||||
|
if (conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
|
||||||
|
conf
|
||||||
|
.getRemoveConstraintsMap()
|
||||||
|
.get(communityId)
|
||||||
|
.getCriteria()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(crit -> crit.verifyCriteria(param)))
|
||||||
|
removeCommunities.add(communityId);
|
||||||
|
});
|
||||||
|
|
||||||
// communities contains all the communities to be added as context for the result
|
// communities contains all the communities to be added as context for the result
|
||||||
final Set<String> communities = new HashSet<>();
|
final Set<String> communities = new HashSet<>();
|
||||||
|
|
||||||
|
@ -164,7 +181,8 @@ public class ResultTagger implements Serializable {
|
||||||
.getSelectionConstraintsMap()
|
.getSelectionConstraintsMap()
|
||||||
.keySet()
|
.keySet()
|
||||||
.forEach(communityId -> {
|
.forEach(communityId -> {
|
||||||
if (conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
if (!removeCommunities.contains(communityId) &&
|
||||||
|
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
||||||
conf
|
conf
|
||||||
.getSelectionConstraintsMap()
|
.getSelectionConstraintsMap()
|
||||||
.get(communityId)
|
.get(communityId)
|
||||||
|
@ -175,6 +193,9 @@ public class ResultTagger implements Serializable {
|
||||||
});
|
});
|
||||||
|
|
||||||
communities.addAll(aconstraints);
|
communities.addAll(aconstraints);
|
||||||
|
|
||||||
|
communities.removeAll(removeCommunities);
|
||||||
|
|
||||||
if (aconstraints.size() > 0)
|
if (aconstraints.size() > 0)
|
||||||
log.info("Found {} for advancedConstraints ", aconstraints.size());
|
log.info("Found {} for advancedConstraints ", aconstraints.size());
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,9 @@ where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//con
|
||||||
return
|
return
|
||||||
<community>
|
<community>
|
||||||
{ $x//CONFIGURATION/context/@id}
|
{ $x//CONFIGURATION/context/@id}
|
||||||
|
<removeConstraints>
|
||||||
|
{$x//CONFIGURATION/context/param[./@name='removeConstraints']/text() }
|
||||||
|
</removeConstraints>
|
||||||
<advancedConstraints>
|
<advancedConstraints>
|
||||||
{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }
|
{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }
|
||||||
</advancedConstraints>
|
</advancedConstraints>
|
||||||
|
|
|
@ -39,8 +39,10 @@ public class BulkTagJobTest {
|
||||||
+ " \"contributor\" : \"$['contributor'][*]['value']\","
|
+ " \"contributor\" : \"$['contributor'][*]['value']\","
|
||||||
+ " \"description\" : \"$['description'][*]['value']\", "
|
+ " \"description\" : \"$['description'][*]['value']\", "
|
||||||
+ " \"subject\" :\"$['subject'][*]['value']\" , " +
|
+ " \"subject\" :\"$['subject'][*]['value']\" , " +
|
||||||
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"" +
|
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"," +
|
||||||
"} ";
|
"\"sdg\" : \"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"," +
|
||||||
|
"\"hostedby\" : \"$['instance'][*]['hostedby']['key']\" , " +
|
||||||
|
"\"collectedfrom\" : \"$['instance'][*]['collectedfrom']['key']\"} ";
|
||||||
|
|
||||||
private static SparkSession spark;
|
private static SparkSession spark;
|
||||||
|
|
||||||
|
@ -56,7 +58,7 @@ public class BulkTagJobTest {
|
||||||
.toString(
|
.toString(
|
||||||
BulkTagJobTest.class
|
BulkTagJobTest.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_dth.xml"));
|
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_remove.xml"));
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
@ -1525,4 +1527,43 @@ public class BulkTagJobTest {
|
||||||
.count());
|
.count());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
@Test
|
||||||
|
void removeTest() throws Exception {
|
||||||
|
final String pathMap = BulkTagJobTest.pathMap;
|
||||||
|
SparkBulkTagJob
|
||||||
|
.main(
|
||||||
|
new String[]{
|
||||||
|
"-isTest", Boolean.TRUE.toString(),
|
||||||
|
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"-sourcePath",
|
||||||
|
getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints").getPath(),
|
||||||
|
"-taggingConf", taggingConf,
|
||||||
|
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
|
||||||
|
"-outputPath", workingDir.toString() + "/dataset",
|
||||||
|
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
|
||||||
|
"-pathMap", pathMap
|
||||||
|
});
|
||||||
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaRDD<Dataset> tmp = sc
|
||||||
|
.textFile(workingDir.toString() + "/dataset")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
||||||
|
|
||||||
|
Assertions.assertEquals(12, tmp.count());
|
||||||
|
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
|
||||||
|
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
|
||||||
|
|
||||||
|
verificationDataset.createOrReplaceTempView("dataset");
|
||||||
|
String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
|
||||||
|
+ "from dataset "
|
||||||
|
+ "lateral view explode(context) c as MyT "
|
||||||
|
+ "lateral view explode(MyT.datainfo) d as MyD "
|
||||||
|
+ "where MyD.inferenceprovenance = 'bulktagging'";
|
||||||
|
|
||||||
|
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
|
||||||
|
|
||||||
|
idExplodeCommunity.show(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue