forked from D-Net/dnet-hadoop
added test to verify the advconstraints to dth community. inserted some additional logs.
This commit is contained in:
parent
63b8bbc015
commit
b25b401065
|
@ -37,7 +37,7 @@ public class CommunityConfigurationFactory {
|
|||
|
||||
public static CommunityConfiguration newInstance(final String xml) throws DocumentException, SAXException {
|
||||
|
||||
log.debug(String.format("parsing community configuration from:\n%s", xml));
|
||||
log.info(String.format("parsing community configuration from:\n%s", xml));
|
||||
|
||||
final SAXReader reader = new SAXReader();
|
||||
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
|
||||
|
@ -98,6 +98,7 @@ public class CommunityConfigurationFactory {
|
|||
.fromJson(advConstsNode.getText(), SelectionConstraints.class);
|
||||
|
||||
selectionConstraints.setSelection(resolver);
|
||||
log.info("number of selection constraints set " + selectionConstraints.getCriteria().size());
|
||||
return selectionConstraints;
|
||||
}
|
||||
|
||||
|
|
|
@ -10,11 +10,14 @@ import java.util.stream.Collectors;
|
|||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
import eu.dnetlib.dhp.bulktag.SparkBulkTagJob;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
@ -22,6 +25,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
|||
|
||||
/** Created by miriam on 02/08/2018. */
|
||||
public class ResultTagger implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(ResultTagger.class);
|
||||
|
||||
private boolean clearContext(Result result) {
|
||||
int tmp = result.getContext().size();
|
||||
|
@ -149,6 +153,8 @@ public class ResultTagger implements Serializable {
|
|||
});
|
||||
|
||||
communities.addAll(aconstraints);
|
||||
if (aconstraints.size() > 0)
|
||||
log.info("Found {} for advancedConstraints ", aconstraints.size());
|
||||
|
||||
clearContext(result);
|
||||
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
|
||||
package eu.dnetlib.dhp.bulktag.criteria;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Locale;
|
||||
|
||||
@VerbClass("starts_with_caseinsensitive")
|
||||
public class StartsWithVerbIgnoreCase implements Selection, Serializable {
|
||||
|
||||
private String param;
|
||||
|
||||
public StartsWithVerbIgnoreCase() {
|
||||
}
|
||||
|
||||
public StartsWithVerbIgnoreCase(final String param) {
|
||||
this.param = param;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean apply(String value) {
|
||||
return value.toLowerCase().startsWith(param.toLowerCase());
|
||||
}
|
||||
|
||||
public String getParam() {
|
||||
return param;
|
||||
}
|
||||
|
||||
public void setParam(String param) {
|
||||
this.param = param;
|
||||
}
|
||||
}
|
|
@ -219,7 +219,7 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait" to="End"/>
|
||||
<join name="wait" to="eosc_tag"/>
|
||||
|
||||
<action name="eosc_tag">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
|
|
@ -758,7 +758,7 @@ public class BulkTagJobTest {
|
|||
.textFile(workingDir.toString() + "/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
||||
|
||||
Assertions.assertEquals(10, tmp.count());
|
||||
Assertions.assertEquals(12, tmp.count());
|
||||
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
|
||||
|
||||
|
@ -772,14 +772,14 @@ public class BulkTagJobTest {
|
|||
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
|
||||
|
||||
idExplodeCommunity.show(false);
|
||||
Assertions.assertEquals(5, idExplodeCommunity.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
|
||||
// Assertions.assertEquals(5, idExplodeCommunity.count());
|
||||
//
|
||||
// Assertions
|
||||
// .assertEquals(
|
||||
// 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
|
||||
// Assertions
|
||||
// .assertEquals(
|
||||
// 2, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -843,88 +843,136 @@
|
|||
</zenodocommunities>
|
||||
<organizations/>
|
||||
</community>
|
||||
<community id="dariah">
|
||||
<community id="dth">
|
||||
<advancedConstraints>
|
||||
{
|
||||
"criteria": [
|
||||
{
|
||||
"constraint": [
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "North America"
|
||||
},
|
||||
{
|
||||
"verb": "contains",
|
||||
"field": "fos",
|
||||
"value": "05"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"constraint": [
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "North America"
|
||||
},
|
||||
{
|
||||
"verb": "contains",
|
||||
"field": "fos",
|
||||
"value": "06"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"constraint": [
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "Mexico"
|
||||
},
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "United States"
|
||||
},
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "Canada"
|
||||
},
|
||||
{
|
||||
"verb": "contains",
|
||||
"field": "fos",
|
||||
"value": "05"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"constraint": [
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "Mexico"
|
||||
},
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "United States"
|
||||
},
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "Canada"
|
||||
},
|
||||
{
|
||||
"verb": "contains",
|
||||
"field": "fos",
|
||||
"value": "06"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
{"criteria":[
|
||||
{"constraint":[
|
||||
{"verb":"equals_caseinsensitive","field":"subject","value":"digital twins"},
|
||||
{"verb":"contains_caseinsensitive","field":"subject","value":"health"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"structural"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"marine"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"avionics"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"battery"}
|
||||
]},
|
||||
{"constraint":[
|
||||
{"verb":"contains_caseinsensitive","field":"title","value":"Human Digital Twins"}
|
||||
]},
|
||||
{"constraint":[
|
||||
{"verb":"contains_caseinsensitive","field":"description","value":"Human Digital Twins"}
|
||||
]},
|
||||
{"constraint":[
|
||||
{"verb":"equals_caseinsensitive","field":"subject","value":"Human Digital Twins"}
|
||||
]},
|
||||
{"constraint":[
|
||||
{"verb":"contains_caseinsensitive","field":"title","value":"Virtual Human Twin"}
|
||||
]},
|
||||
{"constraint":[
|
||||
{"verb":"contains_caseinsensitive","field":"description","value":"Virtual Human Twin"}
|
||||
]},
|
||||
{"constraint":[
|
||||
{"verb":"equals_caseinsensitive","field":"subject","value":"Virtual Human Twin"}
|
||||
]},
|
||||
{"constraint":[
|
||||
{"verb":"equals_caseinsensitive","field":"subject","value":"digital twin"},
|
||||
{"verb":"contains_caseinsensitive","field":"subject","value":"health"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"structural"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"marine"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"avionics"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"battery"}
|
||||
]},
|
||||
{"constraint":[
|
||||
{"verb":"contains_caseinsensitive","field":"title","value":"digital twin health"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"Acoustic"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"Health Monitoring"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"title","value":"Health Monitoring"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"title","value":"Health Management"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"Health Assessment"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"title","value":"Health Assessment"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"title","value":"Health status"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"ELECTRICAL ENGINEERING"},
|
||||
{"verb":"not_contains_caseinsensitive","field":"subject","value":"Control and Systems Engineering"}
|
||||
]}
|
||||
]}
|
||||
<!-- {-->
|
||||
<!-- "criteria": [-->
|
||||
<!-- {-->
|
||||
<!-- "constraint": [-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "equals_caseinsensitive",-->
|
||||
<!-- "field": "subject",-->
|
||||
<!-- "value": "North America"-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "contains",-->
|
||||
<!-- "field": "fos",-->
|
||||
<!-- "value": "05"-->
|
||||
<!-- }-->
|
||||
<!-- ]-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "constraint": [-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "equals_caseinsensitive",-->
|
||||
<!-- "field": "subject",-->
|
||||
<!-- "value": "North America"-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "contains",-->
|
||||
<!-- "field": "fos",-->
|
||||
<!-- "value": "06"-->
|
||||
<!-- }-->
|
||||
<!-- ]-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "constraint": [-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "equals_caseinsensitive",-->
|
||||
<!-- "field": "subject",-->
|
||||
<!-- "value": "Mexico"-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "equals_caseinsensitive",-->
|
||||
<!-- "field": "subject",-->
|
||||
<!-- "value": "United States"-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "equals_caseinsensitive",-->
|
||||
<!-- "field": "subject",-->
|
||||
<!-- "value": "Canada"-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "contains",-->
|
||||
<!-- "field": "fos",-->
|
||||
<!-- "value": "05"-->
|
||||
<!-- }-->
|
||||
<!-- ]-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "constraint": [-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "equals_caseinsensitive",-->
|
||||
<!-- "field": "subject",-->
|
||||
<!-- "value": "Mexico"-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "equals_caseinsensitive",-->
|
||||
<!-- "field": "subject",-->
|
||||
<!-- "value": "United States"-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "equals_caseinsensitive",-->
|
||||
<!-- "field": "subject",-->
|
||||
<!-- "value": "Canada"-->
|
||||
<!-- },-->
|
||||
<!-- {-->
|
||||
<!-- "verb": "contains",-->
|
||||
<!-- "field": "fos",-->
|
||||
<!-- "value": "06"-->
|
||||
<!-- }-->
|
||||
<!-- ]-->
|
||||
<!-- }-->
|
||||
<!-- ]-->
|
||||
<!-- }-->
|
||||
|
||||
</advancedConstraints>
|
||||
<subjects/>
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue