minor changes
This commit is contained in:
parent
9b3a9b2381
commit
9f66e426db
|
@ -17,45 +17,6 @@ import eu.dnetlib.pace.tree.support.TreeStats;
|
|||
|
||||
class DecisionTreeTest {
|
||||
|
||||
@Test
|
||||
void testJPath() throws IOException {
|
||||
|
||||
DedupConfig conf = DedupConfig
|
||||
.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json")));
|
||||
|
||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json"));
|
||||
|
||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||
|
||||
System.out.println("row = " + row);
|
||||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
|
||||
System.out.println("row = " + row.getAs("countrytitle"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void jsonToModelTest() throws IOException {
|
||||
DedupConfig conf = DedupConfig
|
||||
.load(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkOpenorgsDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
|
||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||
|
||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||
// to check that the same parsing returns the same row
|
||||
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
||||
|
||||
Assertions.assertEquals(row, row1);
|
||||
System.out.println("row = " + row);
|
||||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
}
|
||||
|
||||
@Test
|
||||
void organizationDecisionTreeTest() throws Exception {
|
||||
DedupConfig conf = DedupConfig
|
||||
|
|
|
@ -15,6 +15,28 @@ import eu.dnetlib.pace.model.SparkModel;
|
|||
|
||||
class JsonPathTest {
|
||||
|
||||
@Test
|
||||
void jsonToModelTest() throws IOException {
|
||||
DedupConfig conf = DedupConfig
|
||||
.load(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkOpenorgsDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
|
||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||
|
||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||
// to check that the same parsing returns the same row
|
||||
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
||||
|
||||
Assertions.assertEquals(row, row1);
|
||||
System.out.println("row = " + row);
|
||||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testJPath() throws IOException {
|
||||
|
||||
|
@ -29,29 +51,7 @@ class JsonPathTest {
|
|||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
|
||||
System.out.println("row = " + row.getAs("countrytitle"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void jsonToModelTest() throws IOException {
|
||||
DedupConfig conf = DedupConfig
|
||||
.load(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkOpenorgsDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
|
||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||
|
||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||
// to check that the same parsing returns the same row
|
||||
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
||||
|
||||
Assertions.assertEquals(row, row1);
|
||||
System.out.println("row = " + row);
|
||||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
System.out.println("row = " + row.getAs("country"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -49,18 +49,15 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
|
|||
static DedupConfig config;
|
||||
static JavaSparkContext context;
|
||||
|
||||
// final String entitiesPath = Paths
|
||||
// .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI())
|
||||
// .toFile()
|
||||
// .getAbsolutePath();
|
||||
final String entitiesPath = Paths
|
||||
.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
|
||||
// final String dedupConfPath = Paths
|
||||
// .get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI())
|
||||
// .toFile()
|
||||
// .getAbsolutePath();
|
||||
|
||||
final String entitiesPath = "/Users/miconis/Desktop/pub_to_fix1.json";
|
||||
final String dedupConfPath = "/Users/miconis/Desktop/pub.conf.json";
|
||||
final String dedupConfPath = Paths
|
||||
.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
|
||||
final static int MAX_ACCEPTANCE_DATE = 20;
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"entityType" : "organization",
|
||||
"subEntityValue": "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"queueMaxSize" : "100000",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
"idPath":"$.id",
|
||||
|
@ -24,14 +24,18 @@
|
|||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "gridid",
|
||||
"comparator": "exactMatch",
|
||||
"field": "pid",
|
||||
"comparator": "jsonListMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
"params": {
|
||||
"mode": "percentage",
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1,
|
||||
"threshold": 0.5,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
|
@ -49,7 +53,7 @@
|
|||
},
|
||||
{
|
||||
"field": "country",
|
||||
"comparator": "exactMatch",
|
||||
"comparator": "countryMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
|
@ -144,18 +148,18 @@
|
|||
}
|
||||
},
|
||||
"model" : [
|
||||
{ "name" : "pid", "type": "JSON", "path" : "$.pid[*]"},
|
||||
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
|
||||
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
||||
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti", "Πανεπιστήμιο", "panepistemio"],
|
||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||
|
@ -164,7 +168,7 @@
|
|||
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
|
||||
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
||||
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
||||
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
||||
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό", "eθνικό"],
|
||||
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
||||
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
||||
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue