1
0
Fork 0

[patents] added test and resources

This commit is contained in:
Miriam Baglioni 2024-11-22 17:21:58 +01:00
parent e5b04e61ff
commit 821700299a
4 changed files with 147 additions and 38 deletions

View File

@ -37,7 +37,8 @@ public class PrepareResultCommunitySetStep1 {
* relation
*/
// TODO
private static final String RESULT_CONTEXT_QUERY_TEMPLATE = "select target resultId, community_context "
private static final String RESULT_CONTEXT_QUERY_TEMPLATE =
"select target resultId, community_context "
+ "from (select id, collect_set(co.id) community_context "
+ " from result "
+ " lateral view explode (context) c as co "
@ -59,15 +60,26 @@ public class PrepareResultCommunitySetStep1 {
+ "where length(co) > 0 "
+ "group by resultId";
private static final String RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO = "select target resultId, community_context "
+ "from (select id, collect_set(co.id) community_context "
+ " from result "
+ " lateral view explode (context) c as co "
+ " where datainfo.deletedbyinference = false %s "
+ " and array_contains(instance.instancetype.classname, 'Patent') group by id) p "
+ " JOIN "
+ " (select source, target from relation "
+ " where datainfo.deletedbyinference = false %s ) r ON p.id = r.source";
private static final String RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO =
"select target as resultId, community_context " +
"from resultWithContext rwc " +
"join relatedToRelations r " +
"join patents p " +
"on rwc.id = r.source and r.target = p.id";
private static final String RESULT_WITH_CONTEXT = "select id, collect_set(co.id) community_context \n" +
" from result " +
" lateral view explode (context) c as co " +
" where datainfo.deletedbyinference = false AND lower(co.id) IN %s" +
" group by id";
private static final String RESULT_PATENT = "select id " +
" from result " +
" where array_contains(instance.instancetype.classname, 'Patent')";
private static final String IS_RELATED_TO_RELATIONS = "select source, target " +
" from relation " +
" where lower(relClass) = 'isrelatedto' and datainfo.deletedbyinference = false";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
@ -95,20 +107,18 @@ public class PrepareResultCommunitySetStep1 {
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
final String allowedsemrel = join(",", Arrays.stream(parser.get("allowedsemrels").split(";"))
.map(value -> "'" + value.toLowerCase() + "'")
.toArray(String[]::new));
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
final String allowedsemrel ="(" + join(",",
Arrays.asList(parser.get("allowedsemrels").split(";")).stream().map(value -> "'" + value.toLowerCase() + "'")
.toArray(String[]::new)) + ")";
log.info("allowedSemRel: {}", allowedsemrel);
final String baseURL = parser.get("baseURL");
log.info("baseURL: {}", baseURL);
final String communityIdList = join(",", getCommunityList(baseURL).stream()
final String communityIdList = "(" + join(",", getCommunityList(baseURL).stream()
.map(value -> "'" + value.toLowerCase() + "'")
.toArray(String[]::new));
.toArray(String[]::new)) + ")";
log.info("communityIdList: {}", new Gson().toJson(communityIdList));
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
log.info("resultType: {}", resultType);
@ -156,32 +166,38 @@ public class PrepareResultCommunitySetStep1 {
final String outputResultPath = outputPath + "/" + resultType;
log.info("writing output results to: {}", outputResultPath);
String resultContextQuery = String
.format(
RESULT_CONTEXT_QUERY_TEMPLATE,
" lower(co.id) IN " + communityIdList,
" AND lower(relClass) IN " + allowedsemrel);
String resultContextQueryIsRelatedTo = String
.format(
RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO,
" AND lower(co.id) IN " + communityIdList,
"AND lower(relClass) = '"+
ModelConstants.IS_RELATED_TO.toLowerCase() + "'");
"AND lower(co.id) IN " + communityIdList,
"AND lower(relClass) IN " + allowedsemrel);
Dataset<Row> result_context = spark.sql(resultContextQuery);
//result_context.createOrReplaceTempView("result_context");
// spark
// .sql(RESULT_COMMUNITY_LIST_QUERY)
// .as(Encoders.bean(ResultCommunityList.class))
// .write()
// .option("compression", "gzip")
// .mode(SaveMode.Overwrite)
// .json(outputResultPath);
Dataset<Row> rwc = spark.sql(String.format(RESULT_WITH_CONTEXT, communityIdList));
Dataset<Row> patents = spark.sql(RESULT_PATENT);
Dataset<Row> relatedToRelations = spark.sql(IS_RELATED_TO_RELATIONS);
rwc.createOrReplaceTempView("resultWithContext");
patents.createOrReplaceTempView("patents");
relatedToRelations.createOrReplaceTempView("relatedTorelations");
result_context = result_context.unionAll( spark.sql(RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO));
result_context.createOrReplaceTempView("result_context");
spark
.sql(RESULT_COMMUNITY_LIST_QUERY)
.as(Encoders.bean(ResultCommunityList.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputResultPath);
result_context = spark.sql(resultContextQueryIsRelatedTo);
result_context.createOrReplaceTempView("result_context");
spark
.sql(RESULT_COMMUNITY_LIST_QUERY)
.as(Encoders.bean(ResultCommunityList.class))
@ -189,6 +205,7 @@ public class PrepareResultCommunitySetStep1 {
.option("compression", "gzip")
.mode(SaveMode.Append)
.json(outputResultPath);
}
public static List<String> getCommunityList(final String baseURL) throws IOException {

View File

@ -6,8 +6,11 @@ import static org.apache.spark.sql.functions.desc;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -25,6 +28,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import scala.collection.Seq;
public class ResultToCommunityJobTest {
@ -271,4 +275,55 @@ public class ResultToCommunityJobTest {
.get(0)
.getString(0));
}
@Test
public void prepareStep1Test() throws Exception {
/*
final String allowedsemrel = join(",", Arrays.stream(parser.get("allowedsemrels").split(";"))
.map(value -> "'" + value.toLowerCase() + "'")
.toArray(String[]::new));
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
final String baseURL = parser.get("baseURL");
log.info("baseURL: {}", baseURL);
*/
PrepareResultCommunitySetStep1
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", getClass()
.getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph")
.getPath(),
"-hive_metastore_uris", "",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath", workingDir.toString() + "/preparedInfo",
"-allowedsemrels","issupplementto;issupplementedby",
"-baseURL","https://dev-openaire.d4science.org/openaire/community/"
});
org.apache.spark.sql.Dataset<ResultCommunityList> resultCommunityList = spark.read().schema(Encoders.bean(ResultCommunityList.class).schema())
.json(workingDir.toString() + "/preparedInfo/publication")
.as(Encoders.bean(ResultCommunityList.class));
Assertions.assertEquals(2, resultCommunityList.count());
Assertions.assertEquals(1,resultCommunityList.filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'").count());
Assertions.assertEquals(1,resultCommunityList.filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'").count());
ArrayList<String> communities = resultCommunityList
.filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'")
.first().getCommunityList();
Assertions.assertEquals(2, communities.size());
Assertions.assertTrue(communities.stream().anyMatch(cid -> "beopen".equals(cid)));
Assertions.assertTrue(communities.stream().anyMatch(cid -> "dh-ch".equals(cid)));
communities = resultCommunityList
.filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'")
.first().getCommunityList();
Assertions.assertEquals(1, communities.size());
Assertions.assertEquals("dh-ch", communities.get(0));
}
}

View File

@ -0,0 +1,24 @@
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"issupplementedby","relType":"resultOrganization","source":"50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6","subRelType":"affiliation","target":"50|pending_org_::82f63b2d21ae88596b9d8991780e9888","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"issupplementedby","relType":"resultOrganization","source":"50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6","subRelType":"affiliation","target":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|opendoar____::f0dd4a99fba6075a9494772b58f95280","subRelType":"affiliation","target":"20|openorgs____::322ff2a6524820640bc5d1311871585e","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539","subRelType":"affiliation","target":"20|openorgs____::58e60f1715d219aa6757ba0b0f2ccbce","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","subRelType":"affiliation","source":"10|issn___print::a7a2010e75d849442790955162ef4e42","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|issn___print::a7a2010e75d849442790955162ef4e43","subRelType":"affiliation","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|issn___print::a7a2010e75d849442790955162ef4e44","subRelType":"affiliation","target":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|issn___print::a7a2010e75d849442790955162ef4e45","subRelType":"affiliation","target":"20|pending_org_::c522a7c935f9fd9578122e60eeec282c","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isrelatedto","relType":"resultOrganization","source":"50|openorgs____::64badd35233ba2cd4946368ef2f4cf57","subRelType":"affiliation","target":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","subRelType":"affiliation","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isrelatedto","relType":"resultOrganization","source":"50|355e65625b88::74009c567c81b4aa55c813db658734df","subRelType":"affiliation","target":"50|dedup_wf_001::08d6f2001319c86d0e69b0f83ad75df2","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::08d6f2001319c86d0e69b0f83ad75df2","subRelType":"affiliation","target":"20|openorgs____::91a81877815afb4ebf25c1a3f3b03c5d","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","subRelType":"affiliation","target":"50|dedup_wf_001::0a1cdf269375d32ce341fdeb0e92dfa8","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0a1cdf269375d32ce341fdeb0e92dfa8","subRelType":"affiliation","target":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","subRelType":"affiliation","target":"50|dedup_wf_001::0ab92bed024ee6883c7a1244722e5eec","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0ab92bed024ee6883c7a1244722e5eec","subRelType":"affiliation","target":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","subRelType":"affiliation","target":"50|dedup_wf_001::0ca26c736ad4d15b3d5ee90a4d7853e1","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0ca26c736ad4d15b3d5ee90a4d7853e1","subRelType":"affiliation","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","subRelType":"affiliation","target":"50|dedup_wf_001::0ef8dfab3927cb4d69df0d3113f05a42","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0ef8dfab3927cb4d69df0d3113f05a42","subRelType":"affiliation","target":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","subRelType":"affiliation","target":"50|dedup_wf_001::0f488ad00253126c14a21abe6b2d406c","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0f488ad00253126c14a21abe6b2d406c","subRelType":"affiliation","target":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|pending_org_::c522a7c935f9fd9578122e60eeec282c","subRelType":"affiliation","target":"50|dedup_wf_001::12206bf78aabd7d52132477182d19147","validated":false}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::12206bf78aabd7d52132477182d19147","subRelType":"affiliation","target":"20|pending_org_::c522a7c935f9fd9578122e60eeec282c","validated":false}