Merge pull request 'advConstraintsInBeta' (#288) from advConstraintsInBeta into master

Reviewed-on: D-Net/dnet-hadoop#288
This commit is contained in:
Claudio Atzori 2023-04-06 15:24:23 +02:00
commit 4a4ca634f0
9 changed files with 180 additions and 9 deletions

View File

@ -58,9 +58,12 @@ public class MakeTarArchive implements Serializable {
makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit, rename); makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit, rename);
} }
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit) throws IOException{
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit)
throws IOException {
makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit, false); makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit, false);
} }
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit, public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit,
boolean rename) boolean rename)
throws IOException { throws IOException {

View File

@ -24,7 +24,8 @@ public class Community implements Serializable {
public boolean isValid() { public boolean isValid() {
return !getSubjects().isEmpty() return !getSubjects().isEmpty()
|| !getProviders().isEmpty() || !getProviders().isEmpty()
|| !getZenodoCommunities().isEmpty(); || !getZenodoCommunities().isEmpty()
|| getConstraints().getCriteria() != null;
} }
public String getId() { public String getId() {

View File

@ -85,9 +85,23 @@ public class CommunityConfigurationFactory {
c.setSubjects(parseSubjects(node)); c.setSubjects(parseSubjects(node));
c.setProviders(parseDatasources(node)); c.setProviders(parseDatasources(node));
c.setZenodoCommunities(parseZenodoCommunities(node)); c.setZenodoCommunities(parseZenodoCommunities(node));
c.setConstraints(parseConstrains(node));
return c; return c;
} }
private static SelectionConstraints parseConstrains(Node node) {
Node advConstsNode = node.selectSingleNode("./advancedConstraints");
if (advConstsNode == null || StringUtils.isBlank(StringUtils.trim(advConstsNode.getText()))) {
return new SelectionConstraints();
}
SelectionConstraints selectionConstraints = new Gson()
.fromJson(advConstsNode.getText(), SelectionConstraints.class);
selectionConstraints.setSelection(resolver);
log.info("number of selection constraints set " + selectionConstraints.getCriteria().size());
return selectionConstraints;
}
private static List<String> parseSubjects(final Node node) { private static List<String> parseSubjects(final Node node) {
final List<String> subjects = Lists.newArrayList(); final List<String> subjects = Lists.newArrayList();

View File

@ -139,7 +139,7 @@ public class ResultTagger implements Serializable {
.getSelectionConstraintsMap() .getSelectionConstraintsMap()
.keySet() .keySet()
.forEach(communityId -> { .forEach(communityId -> {
if (conf.getSelectionConstraintsMap().get(communityId) != null && if (conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
conf conf
.getSelectionConstraintsMap() .getSelectionConstraintsMap()
.get(communityId) .get(communityId)

View File

@ -0,0 +1,34 @@
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 06/04/23
*/
@VerbClass("starts_with_caseinsensitive")
public class StartsWithIgnoreCaseVerb implements Selection, Serializable {
private String param;
public StartsWithIgnoreCaseVerb() {
}
public StartsWithIgnoreCaseVerb(final String param) {
this.param = param;
}
@Override
public boolean apply(String value) {
return value.toLowerCase().startsWith(param.toLowerCase());
}
public String getParam() {
return param;
}
public void setParam(String param) {
this.param = param;
}
}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;
/**
* @author miriam.baglioni
* @Date 06/04/23
*/
@VerbClass("starts_with")
public class StartsWithVerb implements Selection, Serializable {
private String param;
public StartsWithVerb() {
}
public StartsWithVerb(final String param) {
this.param = param;
}
@Override
public boolean apply(String value) {
return value.startsWith(param);
}
public String getParam() {
return param;
}
public void setParam(String param) {
this.param = param;
}
}

View File

@ -39,7 +39,10 @@ public class BulkTagJobTest {
+ " \"title\" : \"$['title'][*]['value']\"," + " \"title\" : \"$['title'][*]['value']\","
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\"," + " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\"}"; + " \"description\" : \"$['description'][*]['value']\", "
+ " \"subject\" :\"$['subject'][*]['value']\" , " +
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"" +
"} ";
private static SparkSession spark; private static SparkSession spark;
@ -763,7 +766,7 @@ public class BulkTagJobTest {
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query); org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
idExplodeCommunity.show(false); idExplodeCommunity.show(false);
Assertions.assertEquals(3, idExplodeCommunity.count()); Assertions.assertEquals(4, idExplodeCommunity.count());
Assertions Assertions
.assertEquals( .assertEquals(

View File

@ -844,6 +844,88 @@
<organizations/> <organizations/>
</community> </community>
<community id="dariah"> <community id="dariah">
<advancedConstraints>
{
"criteria": [
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "North America"
},
{
"verb": "contains",
"field": "fos",
"value": "05"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "North America"
},
{
"verb": "contains",
"field": "fos",
"value": "06"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Mexico"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "United States"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Canada"
},
{
"verb": "contains",
"field": "fos",
"value": "05"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Mexico"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "United States"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Canada"
},
{
"verb": "contains",
"field": "fos",
"value": "06"
}
]
}
]
}
</advancedConstraints>
<subjects/> <subjects/>
<datasources> <datasources>
<datasource> <datasource>