minor changes
This commit is contained in:
parent
9b3a9b2381
commit
9f66e426db
|
@ -17,45 +17,6 @@ import eu.dnetlib.pace.tree.support.TreeStats;
|
||||||
|
|
||||||
class DecisionTreeTest {
|
class DecisionTreeTest {
|
||||||
|
|
||||||
@Test
|
|
||||||
void testJPath() throws IOException {
|
|
||||||
|
|
||||||
DedupConfig conf = DedupConfig
|
|
||||||
.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json")));
|
|
||||||
|
|
||||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json"));
|
|
||||||
|
|
||||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
|
||||||
|
|
||||||
System.out.println("row = " + row);
|
|
||||||
Assertions.assertNotNull(row);
|
|
||||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
|
||||||
|
|
||||||
System.out.println("row = " + row.getAs("countrytitle"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void jsonToModelTest() throws IOException {
|
|
||||||
DedupConfig conf = DedupConfig
|
|
||||||
.load(
|
|
||||||
IOUtils
|
|
||||||
.toString(
|
|
||||||
SparkOpenorgsDedupTest.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
|
||||||
|
|
||||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
|
||||||
|
|
||||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
|
||||||
// to check that the same parsing returns the same row
|
|
||||||
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
|
||||||
|
|
||||||
Assertions.assertEquals(row, row1);
|
|
||||||
System.out.println("row = " + row);
|
|
||||||
Assertions.assertNotNull(row);
|
|
||||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void organizationDecisionTreeTest() throws Exception {
|
void organizationDecisionTreeTest() throws Exception {
|
||||||
DedupConfig conf = DedupConfig
|
DedupConfig conf = DedupConfig
|
||||||
|
|
|
@ -15,6 +15,28 @@ import eu.dnetlib.pace.model.SparkModel;
|
||||||
|
|
||||||
class JsonPathTest {
|
class JsonPathTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void jsonToModelTest() throws IOException {
|
||||||
|
DedupConfig conf = DedupConfig
|
||||||
|
.load(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkOpenorgsDedupTest.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||||
|
|
||||||
|
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||||
|
|
||||||
|
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
// to check that the same parsing returns the same row
|
||||||
|
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
|
||||||
|
Assertions.assertEquals(row, row1);
|
||||||
|
System.out.println("row = " + row);
|
||||||
|
Assertions.assertNotNull(row);
|
||||||
|
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testJPath() throws IOException {
|
void testJPath() throws IOException {
|
||||||
|
|
||||||
|
@ -29,29 +51,7 @@ class JsonPathTest {
|
||||||
Assertions.assertNotNull(row);
|
Assertions.assertNotNull(row);
|
||||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||||
|
|
||||||
System.out.println("row = " + row.getAs("countrytitle"));
|
System.out.println("row = " + row.getAs("country"));
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void jsonToModelTest() throws IOException {
|
|
||||||
DedupConfig conf = DedupConfig
|
|
||||||
.load(
|
|
||||||
IOUtils
|
|
||||||
.toString(
|
|
||||||
SparkOpenorgsDedupTest.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
|
||||||
|
|
||||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
|
||||||
|
|
||||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
|
||||||
// to check that the same parsing returns the same row
|
|
||||||
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
|
||||||
|
|
||||||
Assertions.assertEquals(row, row1);
|
|
||||||
System.out.println("row = " + row);
|
|
||||||
Assertions.assertNotNull(row);
|
|
||||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -49,18 +49,15 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
|
||||||
static DedupConfig config;
|
static DedupConfig config;
|
||||||
static JavaSparkContext context;
|
static JavaSparkContext context;
|
||||||
|
|
||||||
// final String entitiesPath = Paths
|
final String entitiesPath = Paths
|
||||||
// .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI())
|
.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI())
|
||||||
// .toFile()
|
.toFile()
|
||||||
// .getAbsolutePath();
|
.getAbsolutePath();
|
||||||
|
|
||||||
// final String dedupConfPath = Paths
|
final String dedupConfPath = Paths
|
||||||
// .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI())
|
.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI())
|
||||||
// .toFile()
|
.toFile()
|
||||||
// .getAbsolutePath();
|
.getAbsolutePath();
|
||||||
|
|
||||||
final String entitiesPath = "/Users/miconis/Desktop/pub_to_fix1.json";
|
|
||||||
final String dedupConfPath = "/Users/miconis/Desktop/pub.conf.json";
|
|
||||||
|
|
||||||
final static int MAX_ACCEPTANCE_DATE = 20;
|
final static int MAX_ACCEPTANCE_DATE = 20;
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
"entityType" : "organization",
|
"entityType" : "organization",
|
||||||
"subEntityValue": "organization",
|
"subEntityValue": "organization",
|
||||||
"orderField" : "legalname",
|
"orderField" : "legalname",
|
||||||
"queueMaxSize" : "2000",
|
"queueMaxSize" : "100000",
|
||||||
"groupMaxSize" : "50",
|
"groupMaxSize" : "50",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
"idPath":"$.id",
|
"idPath":"$.id",
|
||||||
|
@ -24,14 +24,18 @@
|
||||||
"start": {
|
"start": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "gridid",
|
"field": "pid",
|
||||||
"comparator": "exactMatch",
|
"comparator": "jsonListMatch",
|
||||||
"weight": 1,
|
"weight": 1,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "false",
|
||||||
"params": {}
|
"params": {
|
||||||
|
"mode": "percentage",
|
||||||
|
"jpath_value": "$.value",
|
||||||
|
"jpath_classid": "$.qualifier.classid"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 1,
|
"threshold": 0.5,
|
||||||
"aggregation": "AVG",
|
"aggregation": "AVG",
|
||||||
"positive": "MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
|
@ -49,7 +53,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "country",
|
"field": "country",
|
||||||
"comparator": "exactMatch",
|
"comparator": "countryMatch",
|
||||||
"weight": 1,
|
"weight": 1,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {}
|
"params": {}
|
||||||
|
@ -144,18 +148,18 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"model" : [
|
"model" : [
|
||||||
|
{ "name" : "pid", "type": "JSON", "path" : "$.pid[*]"},
|
||||||
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
|
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
|
||||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
|
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
|
||||||
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
|
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
|
||||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
||||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
|
|
||||||
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
||||||
],
|
],
|
||||||
"blacklists" : {
|
"blacklists" : {
|
||||||
"legalname" : []
|
"legalname" : []
|
||||||
},
|
},
|
||||||
"synonyms": {
|
"synonyms": {
|
||||||
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti", "Πανεπιστήμιο", "panepistemio"],
|
||||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||||
|
@ -164,7 +168,7 @@
|
||||||
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
|
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
|
||||||
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
||||||
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
||||||
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό", "eθνικό"],
|
||||||
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
||||||
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
||||||
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue