forked from D-Net/dnet-hadoop
new tests
This commit is contained in:
parent
07837e51a9
commit
0e407b5f23
|
@ -2,12 +2,10 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ -15,19 +13,20 @@ import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.text.similarity.CosineDistance;
|
import org.apache.commons.text.similarity.CosineDistance;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
|
import org.apache.hadoop.security.WhitelistBasedResolver;
|
||||||
|
import org.apache.neethi.Assertion;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.junit.jupiter.api.AfterAll;
|
import org.junit.jupiter.api.AfterAll;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.intuit.fuzzymatcher.component.MatchService;
|
import com.intuit.fuzzymatcher.component.MatchService;
|
||||||
|
@ -38,9 +37,9 @@ import com.intuit.fuzzymatcher.domain.Match;
|
||||||
import com.wcohen.ss.Levenstein;
|
import com.wcohen.ss.Levenstein;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.authorpids.*;
|
import eu.dnetlib.dhp.oa.graph.clean.authorpids.*;
|
||||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
import eu.dnetlib.dhp.oa.graph.clean.authorpids.constraints.ConstraintResolver;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.clean.authorpids.constraints.ConstraintResolverFactory;
|
||||||
import me.xdrop.fuzzywuzzy.FuzzySearch;
|
import me.xdrop.fuzzywuzzy.FuzzySearch;
|
||||||
import net.sf.saxon.trans.Maker;
|
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class CleanOrcidTest {
|
public class CleanOrcidTest {
|
||||||
|
@ -52,6 +51,7 @@ public class CleanOrcidTest {
|
||||||
private static Path workingDir;
|
private static Path workingDir;
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanOrcidTest.class);
|
private static final Logger log = LoggerFactory.getLogger(CleanOrcidTest.class);
|
||||||
|
// JSON whitelist handed to Gson (as WhiteList) and to the report job through "-whitelist".
// It declares three criteria groups:
// 1) "oname" and "osurname" both "shorterorequal" 2;
// 2) "name" and "surname" both "shorterorequal" 2;
// 3) "oname" equals "Given Names Deactivated" and "osurname" equals "Family Name Deactivated".
// NOTE(review): the "o" prefix presumably marks the ORCID-side name fields - confirm against MakeReportSparkJob.
final String whitelist = "{\"whitelist\":[{\"criteria\":[{\"verb\":\"shorterorequal\",\"field\":\"oname\",\"value\":\"2\"},{\"verb\":\"shorterorequal\",\"field\":\"osurname\",\"value\":\"2\"}]},{\"criteria\":[{\"verb\":\"shorterorequal\", \"field\":\"name\", \"value\":\"2\"},{\"verb\":\"shorterorequal\", \"field\":\"surname\", \"value\":\"2\"}]}, {\"criteria\":[{\"verb\":\"equals\", \"field\":\"oname\", \"value\":\"Given Names Deactivated\"},{\"verb\":\"equals\", \"field\":\"osurname\", \"value\":\"Family Name Deactivated\"}]}]}";
|
||||||
|
|
||||||
// needed for fuzzywuzzy to get a lower bound ratio under which the authors are most probably different
|
// needed for fuzzywuzzy to get a lower bound ratio under which the authors are most probably different
|
||||||
String[][] wrong = {
|
String[][] wrong = {
|
||||||
|
@ -149,6 +149,12 @@ public class CleanOrcidTest {
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"20", MakeReportSparkJob.handleNameSurname("Adelle Craig")
|
"20", MakeReportSparkJob.handleNameSurname("Adelle Craig")
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"21", MakeReportSparkJob.handleNameSurname("Ramziddin M")
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"20", MakeReportSparkJob.handleNameSurname("R. Mansurov")
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -181,6 +187,118 @@ public class CleanOrcidTest {
|
||||||
spark.stop();
|
spark.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void loadBlackList() {
|
||||||
|
|
||||||
|
WhiteList loadedWhiteList = new Gson().fromJson(whitelist, WhiteList.class);
|
||||||
|
ConstraintResolver resolver = ConstraintResolverFactory.newInstance();
|
||||||
|
|
||||||
|
loadedWhiteList.getWhitelist().forEach(c -> {
|
||||||
|
try {
|
||||||
|
c.setSelection(resolver);
|
||||||
|
} catch (InvocationTargetException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (NoSuchMethodException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (InstantiationException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (IllegalAccessException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Map<String, String> param = new HashMap<>();
|
||||||
|
param.put("oname", "Miriam");
|
||||||
|
param.put("name", "Miriam");
|
||||||
|
param.put("osurname", "Miriam");
|
||||||
|
param.put("surname", "Miriam");
|
||||||
|
loadedWhiteList.getWhitelist().forEach(c -> Assertions.assertFalse(c.verifyCriteria(param)));
|
||||||
|
|
||||||
|
param.put("oname", "P");
|
||||||
|
param.put("osurname", "tj");
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, loadedWhiteList
|
||||||
|
.getWhitelist()
|
||||||
|
.stream()
|
||||||
|
.map(c -> c.verifyCriteria(param))
|
||||||
|
.filter(Boolean::valueOf)
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
.size());
|
||||||
|
|
||||||
|
param.put("oname", "Given Names Deactivated");
|
||||||
|
param.put("osurname", "Family Name Deactivated");
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, loadedWhiteList
|
||||||
|
.getWhitelist()
|
||||||
|
.stream()
|
||||||
|
.map(c -> c.verifyCriteria(param))
|
||||||
|
.filter(Boolean::valueOf)
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
.size());
|
||||||
|
|
||||||
|
param.put("name", "P");
|
||||||
|
param.put("surname", "tj");
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
2, loadedWhiteList
|
||||||
|
.getWhitelist()
|
||||||
|
.stream()
|
||||||
|
.map(c -> c.verifyCriteria(param))
|
||||||
|
.filter(Boolean::valueOf)
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
.size());
|
||||||
|
|
||||||
|
param.put("oname", "Given Names Deactivated");
|
||||||
|
param.put("osurname", "Family Name Deactivated");
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
2, loadedWhiteList
|
||||||
|
.getWhitelist()
|
||||||
|
.stream()
|
||||||
|
.map(c -> c.verifyCriteria(param))
|
||||||
|
.filter(Boolean::valueOf)
|
||||||
|
.collect(Collectors.toList())
|
||||||
|
.size());
|
||||||
|
|
||||||
|
//
|
||||||
|
// Assertions
|
||||||
|
// .assertEquals(
|
||||||
|
// 0, loadedWhiteList
|
||||||
|
// .getWhitelist()
|
||||||
|
// .stream()
|
||||||
|
// .map(c -> c.verifyCriteria("Family Names Deactivated"))
|
||||||
|
// .filter(v -> v > 1)
|
||||||
|
// .collect(Collectors.toList())
|
||||||
|
// .size());
|
||||||
|
//
|
||||||
|
// Assertions
|
||||||
|
// .assertEquals(
|
||||||
|
// 1, loadedWhiteList
|
||||||
|
// .getWhitelist()
|
||||||
|
// .stream()
|
||||||
|
// .map(c -> c.verifyCriteria("Family Name Deactivated"))
|
||||||
|
// .filter(v -> v > 1)
|
||||||
|
// .collect(Collectors.toList())
|
||||||
|
// .size());
|
||||||
|
//
|
||||||
|
// Assertions
|
||||||
|
// .assertEquals(
|
||||||
|
// 1, loadedWhiteList
|
||||||
|
// .getWhitelist()
|
||||||
|
// .stream()
|
||||||
|
// .map(c -> c.verifyCriteria("Given Names Deactivated"))
|
||||||
|
// .filter(v -> v > 1)
|
||||||
|
// .collect(Collectors.toList())
|
||||||
|
// .size());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void loadOrcid() {
|
public void loadOrcid() {
|
||||||
|
|
||||||
|
@ -201,11 +319,30 @@ public class CleanOrcidTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void serializeConstraint() throws JsonProcessingException {
|
||||||
|
WhiteList whiteList = new WhiteList();
|
||||||
|
|
||||||
|
SelectionConstraints sc = new SelectionConstraints();
|
||||||
|
|
||||||
|
Constraints c = new Constraints();
|
||||||
|
c.setVerb("verb");
|
||||||
|
c.setValue("value");
|
||||||
|
c.setField("field");
|
||||||
|
|
||||||
|
sc.setCriteria(Arrays.asList(c, c));
|
||||||
|
|
||||||
|
whiteList.setWhitelist(Arrays.asList(sc));
|
||||||
|
|
||||||
|
System.out.println(OBJECT_MAPPER.writeValueAsString(whiteList));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void makeReportTest() throws Exception {
|
public void makeReportTest() throws Exception {
|
||||||
final String inputPath = getClass()
|
final String inputPath = "";
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/part-00000.gz")
|
// getClass()
|
||||||
.getPath();
|
// .getResource("/eu/dnetlib/dhp/oa/graph/clean/part-00000.gz")
|
||||||
|
// .getPath();
|
||||||
|
|
||||||
final String preparedInfoPath = getClass()
|
final String preparedInfoPath = getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/part-00000-prepared.json.gz")
|
.getResource("/eu/dnetlib/dhp/oa/graph/clean/part-00000-prepared.json.gz")
|
||||||
|
@ -221,7 +358,8 @@ public class CleanOrcidTest {
|
||||||
"-inputPath", inputPath,
|
"-inputPath", inputPath,
|
||||||
"-preparedInfoPath", preparedInfoPath,
|
"-preparedInfoPath", preparedInfoPath,
|
||||||
"-orcidInputPath", orcidInputPath,
|
"-orcidInputPath", orcidInputPath,
|
||||||
"-graphTableClassName", "eu.dnetlib.dhp.schema.oaf.Publication"
|
"-graphTableClassName", "eu.dnetlib.dhp.schema.oaf.Publication",
|
||||||
|
"-whitelist", whitelist
|
||||||
});
|
});
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
@ -311,7 +449,7 @@ public class CleanOrcidTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMakeReport() {
|
public void testMakeReport() throws IOException {
|
||||||
ResultInfo ri = new ResultInfo();
|
ResultInfo ri = new ResultInfo();
|
||||||
ri.setName("Prasanth");
|
ri.setName("Prasanth");
|
||||||
ri.setSurname("Manohar");
|
ri.setSurname("Manohar");
|
||||||
|
@ -321,9 +459,39 @@ public class CleanOrcidTest {
|
||||||
oa.setSurname("Ramesh");
|
oa.setSurname("Ramesh");
|
||||||
|
|
||||||
Tuple2<ResultInfo, OrcidAuthotitative> t2 = new Tuple2<ResultInfo, OrcidAuthotitative>(ri, oa);
|
Tuple2<ResultInfo, OrcidAuthotitative> t2 = new Tuple2<ResultInfo, OrcidAuthotitative>(ri, oa);
|
||||||
Tuple2<String, ReportInfo> tmp = MakeReportSparkJob.getStringReportInfoFuzzyTuple2(t2);
|
Tuple2<String, ReportInfo> tmp = MakeReportSparkJob.getStringReportInfoTuple2(t2);
|
||||||
|
|
||||||
System.out.println(new Gson().toJson(tmp._2(), ReportInfo.class));
|
System.out.println(new Gson().toJson(tmp._2(), ReportInfo.class));
|
||||||
|
|
||||||
|
ri.setName("Sophia");
|
||||||
|
ri.setSurname("Hooper");
|
||||||
|
|
||||||
|
oa.setName("Man");
|
||||||
|
oa.setSurname("Yang");
|
||||||
|
oa.setCreditName("Man Yang (previous known as Sophia Yang Hooper)");
|
||||||
|
|
||||||
|
WhiteList wl = new Gson().fromJson(whitelist, WhiteList.class);
|
||||||
|
ConstraintResolver resolver = ConstraintResolverFactory.newInstance();
|
||||||
|
|
||||||
|
wl.getWhitelist().forEach(c -> {
|
||||||
|
try {
|
||||||
|
c.setSelection(resolver);
|
||||||
|
} catch (InvocationTargetException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (NoSuchMethodException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (InstantiationException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (IllegalAccessException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
System.out
|
||||||
|
.println(
|
||||||
|
OBJECT_MAPPER
|
||||||
|
.writeValueAsString(
|
||||||
|
MakeReportSparkJob.getStringReportInfoFuzzyTuple2(new Tuple2<>(ri, oa), wl)._2()));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue