forked from D-Net/dnet-hadoop
refactoring
This commit is contained in:
parent
34172455d1
commit
9567c13bc3
|
@ -28,7 +28,7 @@ public class CommunityConfiguration implements Serializable {
|
|||
private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>();
|
||||
// map eosc datasource -> communityid
|
||||
private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>();
|
||||
//map communityid -> remove constraints
|
||||
// map communityid -> remove constraints
|
||||
private Map<String, SelectionConstraints> removeConstraintsMap = new HashMap<>();
|
||||
|
||||
public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() {
|
||||
|
|
|
@ -109,7 +109,7 @@ public class CommunityConfigurationFactory {
|
|||
return new SelectionConstraints();
|
||||
}
|
||||
SelectionConstraints selectionConstraints = new Gson()
|
||||
.fromJson(constsNode.getText(), SelectionConstraints.class);
|
||||
.fromJson(constsNode.getText(), SelectionConstraints.class);
|
||||
|
||||
selectionConstraints.setSelection(resolver);
|
||||
log.info("number of selection constraints set " + selectionConstraints.getCriteria().size());
|
||||
|
|
|
@ -83,18 +83,18 @@ public class ResultTagger implements Serializable {
|
|||
final Set<String> removeCommunities = new HashSet<>();
|
||||
|
||||
conf
|
||||
.getRemoveConstraintsMap()
|
||||
.keySet()
|
||||
.forEach(communityId -> {
|
||||
if (conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
|
||||
conf
|
||||
.getRemoveConstraintsMap()
|
||||
.get(communityId)
|
||||
.getCriteria()
|
||||
.stream()
|
||||
.anyMatch(crit -> crit.verifyCriteria(param)))
|
||||
removeCommunities.add(communityId);
|
||||
});
|
||||
.getRemoveConstraintsMap()
|
||||
.keySet()
|
||||
.forEach(communityId -> {
|
||||
if (conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
|
||||
conf
|
||||
.getRemoveConstraintsMap()
|
||||
.get(communityId)
|
||||
.getCriteria()
|
||||
.stream()
|
||||
.anyMatch(crit -> crit.verifyCriteria(param)))
|
||||
removeCommunities.add(communityId);
|
||||
});
|
||||
|
||||
// communities contains all the communities to be added as context for the result
|
||||
final Set<String> communities = new HashSet<>();
|
||||
|
@ -182,7 +182,7 @@ public class ResultTagger implements Serializable {
|
|||
.keySet()
|
||||
.forEach(communityId -> {
|
||||
if (!removeCommunities.contains(communityId) &&
|
||||
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
||||
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
||||
conf
|
||||
.getSelectionConstraintsMap()
|
||||
.get(communityId)
|
||||
|
|
|
@ -40,9 +40,9 @@ public class BulkTagJobTest {
|
|||
+ " \"description\" : \"$['description'][*]['value']\", "
|
||||
+ " \"subject\" :\"$['subject'][*]['value']\" , " +
|
||||
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"," +
|
||||
"\"sdg\" : \"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"," +
|
||||
"\"sdg\" : \"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"," +
|
||||
"\"hostedby\" : \"$['instance'][*]['hostedby']['key']\" , " +
|
||||
"\"collectedfrom\" : \"$['instance'][*]['collectedfrom']['key']\"} ";
|
||||
"\"collectedfrom\" : \"$['instance'][*]['collectedfrom']['key']\"} ";
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
|
@ -1527,43 +1527,44 @@ public class BulkTagJobTest {
|
|||
.count());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void removeTest() throws Exception {
|
||||
final String pathMap = BulkTagJobTest.pathMap;
|
||||
SparkBulkTagJob
|
||||
.main(
|
||||
new String[]{
|
||||
"-isTest", Boolean.TRUE.toString(),
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath",
|
||||
getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints").getPath(),
|
||||
"-taggingConf", taggingConf,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
|
||||
"-outputPath", workingDir.toString() + "/dataset",
|
||||
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
|
||||
"-pathMap", pathMap
|
||||
});
|
||||
.main(
|
||||
new String[] {
|
||||
"-isTest", Boolean.TRUE.toString(),
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath",
|
||||
getClass()
|
||||
.getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints")
|
||||
.getPath(),
|
||||
"-taggingConf", taggingConf,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
|
||||
"-outputPath", workingDir.toString() + "/dataset",
|
||||
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
|
||||
"-pathMap", pathMap
|
||||
});
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Dataset> tmp = sc
|
||||
.textFile(workingDir.toString() + "/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
||||
.textFile(workingDir.toString() + "/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
||||
|
||||
Assertions.assertEquals(12, tmp.count());
|
||||
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
|
||||
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("dataset");
|
||||
String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
|
||||
+ "from dataset "
|
||||
+ "lateral view explode(context) c as MyT "
|
||||
+ "lateral view explode(MyT.datainfo) d as MyD "
|
||||
+ "where MyD.inferenceprovenance = 'bulktagging'";
|
||||
+ "from dataset "
|
||||
+ "lateral view explode(context) c as MyT "
|
||||
+ "lateral view explode(MyT.datainfo) d as MyD "
|
||||
+ "where MyD.inferenceprovenance = 'bulktagging'";
|
||||
|
||||
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
|
||||
|
||||
idExplodeCommunity.show(false);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue