adding context information to projects and datasources #407

Merged
claudio.atzori merged 8 commits from taggingProjects into beta 2024-03-26 14:53:39 +01:00
6 changed files with 95 additions and 9 deletions
Showing only changes of commit 83bb97be83 - Show all commits

View File

@ -10,6 +10,7 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -177,6 +178,8 @@ public class Utils implements Serializable {
public static List<EntityCommunities> getDatasourceCommunities(String baseURL)throws IOException{
List<CommunityModel> validCommunities = getValidCommunities(baseURL);
HashMap<String, Set<String>> map = new HashMap<>();
String entityPrefix = ModelSupport.getIdPrefix(Datasource.class) + "|" ;
validCommunities.forEach(c -> {
try {
new ObjectMapper().readValue(QueryCommunityAPI.communityDatasource(c.getId(), baseURL), DatasourceList.class)
@ -191,10 +194,18 @@ public class Utils implements Serializable {
}
});
List<EntityCommunities> temp = map.keySet().stream()
.map(k -> EntityCommunities.newInstance(entityPrefix + k, getCollect(k, map))).collect(Collectors.toList());
return map.keySet().stream().map(k -> EntityCommunities.newInstance(k, map.get(k).stream().collect(Collectors.toList()))).collect(Collectors.toList());
return temp;
}
@NotNull
private static List<String> getCollect(String k, HashMap<String, Set<String>> map) {
List<String> temp = map.get(k).stream().collect(Collectors.toList());
return temp;
}
}

View File

@ -9,7 +9,6 @@ import java.util.stream.Collectors;
import eu.dnetlib.dhp.api.model.CommunityEntityMap;
import eu.dnetlib.dhp.api.model.EntityCommunities;
import eu.dnetlib.dhp.api.model.DatasourceCommunitiesList;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Project;
@ -109,14 +108,20 @@ public class SparkBulkTagJob {
.map((MapFunction<Tuple2<Project, EntityCommunities>, Project>) t2 -> {
Project ds = t2._1();
if (t2._2() != null){
List<String> context = ds.getContext().stream().map(c -> c.getId()).collect(Collectors.toList());
List<String> context =
Optional.ofNullable(ds.getContext())
.map(v -> v.stream().map(c -> c.getId()).collect(Collectors.toList()))
.orElse(new ArrayList<>());
if(!Optional.ofNullable(ds.getContext()).isPresent())
ds.setContext(new ArrayList<>());
t2._2().getCommunitiesId().forEach(c -> {
if(!context.contains(c)){
Context con = new Context();
con.setId(c);
con.setDataInfo(Arrays.asList(OafMapperUtils.dataInfo(false,TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
OafMapperUtils.qualifier(TaggingConstants.CLASS_ID_DATASOURCE, TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), "1")));
ds.getContext().add(con)
ds.getContext().add(con);
}
});
}
@ -139,19 +144,27 @@ public class SparkBulkTagJob {
Dataset<Datasource> datasource = readPath(spark, inputPath + "datasource", Datasource.class);
Dataset<EntityCommunities> dc = spark.createDataset(datasourceCommunities, Encoders.bean(EntityCommunities.class));
datasource.joinWith(dc, datasource.col("id").equalTo(dc.col("entityId")), "left")
.map((MapFunction<Tuple2<Datasource, EntityCommunities>, Datasource>) t2 -> {
Datasource ds = t2._1();
if (t2._2() != null){
List<String> context = ds.getContext().stream().map(c -> c.getId()).collect(Collectors.toList());
List<String> context =
Optional.ofNullable(ds.getContext())
.map(v -> v.stream().map(c -> c.getId()).collect(Collectors.toList()))
.orElse(new ArrayList<>());
if(!Optional.ofNullable(ds.getContext()).isPresent())
ds.setContext(new ArrayList<>());
t2._2().getCommunitiesId().forEach(c -> {
if(!context.contains(c)){
Context con = new Context();
con.setId(c);
con.setDataInfo(Arrays.asList(OafMapperUtils.dataInfo(false,TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
OafMapperUtils.qualifier(TaggingConstants.CLASS_ID_DATASOURCE, TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE, ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), "1")));
ds.getContext().add(con)
ds.getContext().add(con);
}
});
}

View File

@ -33,6 +33,6 @@
"paramName": "bu",
"paramLongName": "baseURL",
"paramDescription": "this parameter is to specify the api to be queried (beta or production)",
"paramRequired": false
"paramRequired": true
}
]

View File

@ -325,7 +325,7 @@ public class BulkTagJobTest {
"-taggingConf", taggingConf,
"-outputPath", workingDir.toString() + "/",
"-baseURL", "https://services.openaire.eu/openaire/community/",
"-pathMap", pathMap
});
@ -349,6 +349,8 @@ public class BulkTagJobTest {
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
idExplodeCommunity.show(false);
Assertions.assertEquals(5, idExplodeCommunity.count());
Assertions
.assertEquals(
@ -383,6 +385,63 @@ public class BulkTagJobTest {
.count());
}
@Test
void datasourceTag() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/")
.getPath();
SparkBulkTagJob
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-taggingConf", taggingConf,
"-outputPath", workingDir.toString() + "/",
"-baseURL", "https://services.openaire.eu/openaire/community/",
"-pathMap", pathMap
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Datasource> tmp = sc
.textFile(workingDir.toString() + "/datasource")
.map(item -> OBJECT_MAPPER.readValue(item, Datasource.class));
Assertions.assertEquals(3, tmp.count());
org.apache.spark.sql.Dataset<Datasource> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(Datasource.class));
verificationDataset.createOrReplaceTempView("datasource");
String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
+ "from datasource "
+ "lateral view explode(context) c as MyT "
+ "lateral view explode(MyT.datainfo) d as MyD "
+ "where MyD.inferenceprovenance = 'bulktagging'";
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
idExplodeCommunity.show(false);
Assertions.assertEquals(3, idExplodeCommunity.count());
Assertions
.assertEquals(
3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
Assertions
.assertEquals(
3,
idExplodeCommunity
.filter("name = 'Bulktagging for Community - Datasource'")
.count());
Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'dh-ch'").count());
Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'clarin'").count());
}
@Test
void bulktagByZenodoCommunityTest() throws Exception {
final String sourcePath = getClass()