Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
8 changed files with 1665 additions and 7 deletions
Showing only changes of commit 34172455d1 - Show all commits

View File

@ -15,6 +15,7 @@ public class Community implements Serializable {
private List<Provider> providers = new ArrayList<>(); private List<Provider> providers = new ArrayList<>();
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>(); private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
private SelectionConstraints constraints = new SelectionConstraints(); private SelectionConstraints constraints = new SelectionConstraints();
private SelectionConstraints removeConstraints = new SelectionConstraints();
public String toJson() { public String toJson() {
final Gson g = new Gson(); final Gson g = new Gson();
@ -67,4 +68,12 @@ public class Community implements Serializable {
public void setConstraints(SelectionConstraints constraints) { public void setConstraints(SelectionConstraints constraints) {
this.constraints = constraints; this.constraints = constraints;
} }
public SelectionConstraints getRemoveConstraints() {
return removeConstraints;
}
public void setRemoveConstraints(SelectionConstraints removeConstraints) {
this.removeConstraints = removeConstraints;
}
} }

View File

@ -28,6 +28,8 @@ public class CommunityConfiguration implements Serializable {
private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>(); private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>();
// map eosc datasource -> communityid // map eosc datasource -> communityid
private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>(); private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>();
//map communityid -> remove constraints
private Map<String, SelectionConstraints> removeConstraintsMap = new HashMap<>();
public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() { public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() {
return eoscDatasourceMap; return eoscDatasourceMap;
@ -71,6 +73,14 @@ public class CommunityConfiguration implements Serializable {
this.selectionConstraintsMap = selectionConstraintsMap; this.selectionConstraintsMap = selectionConstraintsMap;
} }
public Map<String, SelectionConstraints> getRemoveConstraintsMap() {
return removeConstraintsMap;
}
public void setRemoveConstraintsMap(Map<String, SelectionConstraints> removeConstraintsMap) {
this.removeConstraintsMap = removeConstraintsMap;
}
CommunityConfiguration(final Map<String, Community> communities) { CommunityConfiguration(final Map<String, Community> communities) {
this.communities = communities; this.communities = communities;
init(); init();
@ -90,6 +100,9 @@ public class CommunityConfiguration implements Serializable {
if (selectionConstraintsMap == null) { if (selectionConstraintsMap == null) {
selectionConstraintsMap = Maps.newHashMap(); selectionConstraintsMap = Maps.newHashMap();
} }
if (removeConstraintsMap == null) {
removeConstraintsMap = Maps.newHashMap();
}
for (Community c : getCommunities().values()) { for (Community c : getCommunities().values()) {
// get subjects // get subjects
@ -111,6 +124,8 @@ public class CommunityConfiguration implements Serializable {
zenodocommunityMap); zenodocommunityMap);
} }
selectionConstraintsMap.put(id, c.getConstraints()); selectionConstraintsMap.put(id, c.getConstraints());
removeConstraintsMap.put(id, c.getRemoveConstraints());
} }
} }

View File

@ -86,6 +86,7 @@ public class CommunityConfigurationFactory {
c.setProviders(parseDatasources(node)); c.setProviders(parseDatasources(node));
c.setZenodoCommunities(parseZenodoCommunities(node)); c.setZenodoCommunities(parseZenodoCommunities(node));
c.setConstraints(parseConstrains(node)); c.setConstraints(parseConstrains(node));
c.setRemoveConstraints(parseRemoveConstrains(node));
return c; return c;
} }
@ -102,6 +103,19 @@ public class CommunityConfigurationFactory {
return selectionConstraints; return selectionConstraints;
} }
private static SelectionConstraints parseRemoveConstrains(Node node) {
Node constsNode = node.selectSingleNode("./removeConstraints");
if (constsNode == null || StringUtils.isBlank(StringUtils.trim(constsNode.getText()))) {
return new SelectionConstraints();
}
SelectionConstraints selectionConstraints = new Gson()
.fromJson(constsNode.getText(), SelectionConstraints.class);
selectionConstraints.setSelection(resolver);
log.info("number of selection constraints set " + selectionConstraints.getCriteria().size());
return selectionConstraints;
}
private static List<String> parseSubjects(final Node node) { private static List<String> parseSubjects(final Node node) {
final List<String> subjects = Lists.newArrayList(); final List<String> subjects = Lists.newArrayList();

View File

@ -79,6 +79,23 @@ public class ResultTagger implements Serializable {
break; break;
} }
// communities contains all the communities to be not added to the context
final Set<String> removeCommunities = new HashSet<>();
conf
.getRemoveConstraintsMap()
.keySet()
.forEach(communityId -> {
if (conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
conf
.getRemoveConstraintsMap()
.get(communityId)
.getCriteria()
.stream()
.anyMatch(crit -> crit.verifyCriteria(param)))
removeCommunities.add(communityId);
});
// communities contains all the communities to be added as context for the result // communities contains all the communities to be added as context for the result
final Set<String> communities = new HashSet<>(); final Set<String> communities = new HashSet<>();
@ -164,7 +181,8 @@ public class ResultTagger implements Serializable {
.getSelectionConstraintsMap() .getSelectionConstraintsMap()
.keySet() .keySet()
.forEach(communityId -> { .forEach(communityId -> {
if (conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null && if (!removeCommunities.contains(communityId) &&
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
conf conf
.getSelectionConstraintsMap() .getSelectionConstraintsMap()
.get(communityId) .get(communityId)
@ -175,6 +193,9 @@ public class ResultTagger implements Serializable {
}); });
communities.addAll(aconstraints); communities.addAll(aconstraints);
communities.removeAll(removeCommunities);
if (aconstraints.size() > 0) if (aconstraints.size() > 0)
log.info("Found {} for advancedConstraints ", aconstraints.size()); log.info("Found {} for advancedConstraints ", aconstraints.size());

View File

@ -10,6 +10,9 @@ where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//con
return return
<community> <community>
{ $x//CONFIGURATION/context/@id} { $x//CONFIGURATION/context/@id}
<removeConstraints>
{$x//CONFIGURATION/context/param[./@name='removeConstraints']/text() }
</removeConstraints>
<advancedConstraints> <advancedConstraints>
{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() } {$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }
</advancedConstraints> </advancedConstraints>

View File

@ -39,8 +39,10 @@ public class BulkTagJobTest {
+ " \"contributor\" : \"$['contributor'][*]['value']\"," + " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\", " + " \"description\" : \"$['description'][*]['value']\", "
+ " \"subject\" :\"$['subject'][*]['value']\" , " + + " \"subject\" :\"$['subject'][*]['value']\" , " +
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"" + "\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"," +
"} "; "\"sdg\" : \"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"," +
"\"hostedby\" : \"$['instance'][*]['hostedby']['key']\" , " +
"\"collectedfrom\" : \"$['instance'][*]['collectedfrom']['key']\"} ";
private static SparkSession spark; private static SparkSession spark;
@ -56,7 +58,7 @@ public class BulkTagJobTest {
.toString( .toString(
BulkTagJobTest.class BulkTagJobTest.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_dth.xml")); "/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_remove.xml"));
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
@ -1525,4 +1527,43 @@ public class BulkTagJobTest {
.count()); .count());
} }
}
@Test
void removeTest() throws Exception {
final String pathMap = BulkTagJobTest.pathMap;
SparkBulkTagJob
.main(
new String[]{
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints").getPath(),
"-taggingConf", taggingConf,
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath", workingDir.toString() + "/dataset",
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
"-pathMap", pathMap
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(12, tmp.count());
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
verificationDataset.createOrReplaceTempView("dataset");
String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ "from dataset "
+ "lateral view explode(context) c as MyT "
+ "lateral view explode(MyT.datainfo) d as MyD "
+ "where MyD.inferenceprovenance = 'bulktagging'";
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
idExplodeCommunity.show(false);
}
}