minor changes

This commit is contained in:
Michele De Bonis 2024-09-30 17:44:50 +02:00
parent 9b3a9b2381
commit 9f66e426db
5 changed files with 45 additions and 323 deletions

View File

@ -17,45 +17,6 @@ import eu.dnetlib.pace.tree.support.TreeStats;
class DecisionTreeTest {
@Test
void testJPath() throws IOException {
DedupConfig conf = DedupConfig
.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json")));
final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json"));
Row row = SparkModel.apply(conf).rowFromJson(org);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
System.out.println("row = " + row.getAs("countrytitle"));
}
@Test
void jsonToModelTest() throws IOException {
DedupConfig conf = DedupConfig
.load(
IOUtils
.toString(
SparkOpenorgsDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
Row row = SparkModel.apply(conf).rowFromJson(org);
// to check that the same parsing returns the same row
Row row1 = SparkModel.apply(conf).rowFromJson(org);
Assertions.assertEquals(row, row1);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
}
@Test
void organizationDecisionTreeTest() throws Exception {
DedupConfig conf = DedupConfig

View File

@ -15,6 +15,28 @@ import eu.dnetlib.pace.model.SparkModel;
class JsonPathTest {
@Test
void jsonToModelTest() throws IOException {
DedupConfig conf = DedupConfig
.load(
IOUtils
.toString(
SparkOpenorgsDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
Row row = SparkModel.apply(conf).rowFromJson(org);
// to check that the same parsing returns the same row
Row row1 = SparkModel.apply(conf).rowFromJson(org);
Assertions.assertEquals(row, row1);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
}
@Test
void testJPath() throws IOException {
@ -29,29 +51,7 @@ class JsonPathTest {
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
System.out.println("row = " + row.getAs("countrytitle"));
}
@Test
void jsonToModelTest() throws IOException {
DedupConfig conf = DedupConfig
.load(
IOUtils
.toString(
SparkOpenorgsDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
Row row = SparkModel.apply(conf).rowFromJson(org);
// to check that the same parsing returns the same row
Row row1 = SparkModel.apply(conf).rowFromJson(org);
Assertions.assertEquals(row, row1);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
System.out.println("row = " + row.getAs("country"));
}
@Test

View File

@ -49,18 +49,15 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
static DedupConfig config;
static JavaSparkContext context;
// final String entitiesPath = Paths
// .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI())
// .toFile()
// .getAbsolutePath();
final String entitiesPath = Paths
.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI())
.toFile()
.getAbsolutePath();
// final String dedupConfPath = Paths
// .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI())
// .toFile()
// .getAbsolutePath();
final String entitiesPath = "/Users/miconis/Desktop/pub_to_fix1.json";
final String dedupConfPath = "/Users/miconis/Desktop/pub.conf.json";
final String dedupConfPath = Paths
.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI())
.toFile()
.getAbsolutePath();
final static int MAX_ACCEPTANCE_DATE = 20;

View File

@ -5,7 +5,7 @@
"entityType" : "organization",
"subEntityValue": "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"queueMaxSize" : "100000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"idPath":"$.id",
@ -24,14 +24,18 @@
"start": {
"fields": [
{
"field": "gridid",
"comparator": "exactMatch",
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
"params": {
"mode": "percentage",
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 1,
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
@ -49,7 +53,7 @@
},
{
"field": "country",
"comparator": "exactMatch",
"comparator": "countryMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
@ -144,18 +148,18 @@
}
},
"model" : [
{ "name" : "pid", "type": "JSON", "path" : "$.pid[*]"},
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti", "Πανεπιστήμιο", "panepistemio"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
@ -164,7 +168,7 @@
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό", "eθνικό"],
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],