minor changes

This commit is contained in:
Michele De Bonis 2024-09-30 17:44:50 +02:00
parent 9b3a9b2381
commit 9f66e426db
5 changed files with 45 additions and 323 deletions

View File

@ -17,45 +17,6 @@ import eu.dnetlib.pace.tree.support.TreeStats;
class DecisionTreeTest { class DecisionTreeTest {
@Test
void testJPath() throws IOException {
DedupConfig conf = DedupConfig
.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json")));
final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json"));
Row row = SparkModel.apply(conf).rowFromJson(org);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
System.out.println("row = " + row.getAs("countrytitle"));
}
@Test
void jsonToModelTest() throws IOException {
DedupConfig conf = DedupConfig
.load(
IOUtils
.toString(
SparkOpenorgsDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
Row row = SparkModel.apply(conf).rowFromJson(org);
// to check that the same parsing returns the same row
Row row1 = SparkModel.apply(conf).rowFromJson(org);
Assertions.assertEquals(row, row1);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
}
@Test @Test
void organizationDecisionTreeTest() throws Exception { void organizationDecisionTreeTest() throws Exception {
DedupConfig conf = DedupConfig DedupConfig conf = DedupConfig

View File

@ -15,6 +15,28 @@ import eu.dnetlib.pace.model.SparkModel;
class JsonPathTest { class JsonPathTest {
@Test
void jsonToModelTest() throws IOException {
DedupConfig conf = DedupConfig
.load(
IOUtils
.toString(
SparkOpenorgsDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
Row row = SparkModel.apply(conf).rowFromJson(org);
// to check that the same parsing returns the same row
Row row1 = SparkModel.apply(conf).rowFromJson(org);
Assertions.assertEquals(row, row1);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
}
@Test @Test
void testJPath() throws IOException { void testJPath() throws IOException {
@ -29,29 +51,7 @@ class JsonPathTest {
Assertions.assertNotNull(row); Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier"))); Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
System.out.println("row = " + row.getAs("countrytitle")); System.out.println("row = " + row.getAs("country"));
}
@Test
void jsonToModelTest() throws IOException {
DedupConfig conf = DedupConfig
.load(
IOUtils
.toString(
SparkOpenorgsDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
Row row = SparkModel.apply(conf).rowFromJson(org);
// to check that the same parsing returns the same row
Row row1 = SparkModel.apply(conf).rowFromJson(org);
Assertions.assertEquals(row, row1);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
} }
@Test @Test

View File

@ -49,18 +49,15 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
static DedupConfig config; static DedupConfig config;
static JavaSparkContext context; static JavaSparkContext context;
// final String entitiesPath = Paths final String entitiesPath = Paths
// .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI()) .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI())
// .toFile() .toFile()
// .getAbsolutePath(); .getAbsolutePath();
// final String dedupConfPath = Paths final String dedupConfPath = Paths
// .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI()) .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI())
// .toFile() .toFile()
// .getAbsolutePath(); .getAbsolutePath();
final String entitiesPath = "/Users/miconis/Desktop/pub_to_fix1.json";
final String dedupConfPath = "/Users/miconis/Desktop/pub.conf.json";
final static int MAX_ACCEPTANCE_DATE = 20; final static int MAX_ACCEPTANCE_DATE = 20;

View File

@ -5,7 +5,7 @@
"entityType" : "organization", "entityType" : "organization",
"subEntityValue": "organization", "subEntityValue": "organization",
"orderField" : "legalname", "orderField" : "legalname",
"queueMaxSize" : "2000", "queueMaxSize" : "100000",
"groupMaxSize" : "50", "groupMaxSize" : "50",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
"idPath":"$.id", "idPath":"$.id",
@ -24,14 +24,18 @@
"start": { "start": {
"fields": [ "fields": [
{ {
"field": "gridid", "field": "pid",
"comparator": "exactMatch", "comparator": "jsonListMatch",
"weight": 1, "weight": 1,
"countIfUndefined": "false", "countIfUndefined": "false",
"params": {} "params": {
"mode": "percentage",
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
} }
], ],
"threshold": 1, "threshold": 0.5,
"aggregation": "AVG", "aggregation": "AVG",
"positive": "MATCH", "positive": "MATCH",
"negative": "NO_MATCH", "negative": "NO_MATCH",
@ -49,7 +53,7 @@
}, },
{ {
"field": "country", "field": "country",
"comparator": "exactMatch", "comparator": "countryMatch",
"weight": 1, "weight": 1,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": {} "params": {}
@ -144,18 +148,18 @@
} }
}, },
"model" : [ "model" : [
{ "name" : "pid", "type": "JSON", "path" : "$.pid[*]"},
{ "name" : "country", "type" : "String", "path" : "$.country.classid"}, { "name" : "country", "type" : "String", "path" : "$.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" } { "name" : "originalId", "type" : "String", "path" : "$.id" }
], ],
"blacklists" : { "blacklists" : {
"legalname" : [] "legalname" : []
}, },
"synonyms": { "synonyms": {
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti", "Πανεπιστήμιο", "panepistemio"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
@ -164,7 +168,7 @@
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"], "key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό", "eθνικό"],
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],