new tests

This commit is contained in:
Miriam Baglioni 2020-11-18 12:18:11 +01:00
parent 07837e51a9
commit 0e407b5f23
1 changed file with 183 additions and 15 deletions

View File

@ -2,12 +2,10 @@
package eu.dnetlib.dhp.oa.graph.clean;
import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
@ -15,19 +13,20 @@ import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.hadoop.security.WhitelistBasedResolver;
import org.apache.neethi.Assertion;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.intuit.fuzzymatcher.component.MatchService;
@ -38,9 +37,9 @@ import com.intuit.fuzzymatcher.domain.Match;
import com.wcohen.ss.Levenstein;
import eu.dnetlib.dhp.oa.graph.clean.authorpids.*;
import jdk.nashorn.internal.ir.annotations.Ignore;
import eu.dnetlib.dhp.oa.graph.clean.authorpids.constraints.ConstraintResolver;
import eu.dnetlib.dhp.oa.graph.clean.authorpids.constraints.ConstraintResolverFactory;
import me.xdrop.fuzzywuzzy.FuzzySearch;
import net.sf.saxon.trans.Maker;
import scala.Tuple2;
public class CleanOrcidTest {
@ -52,6 +51,7 @@ public class CleanOrcidTest {
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(CleanOrcidTest.class);
final String whitelist = "{\"whitelist\":[{\"criteria\":[{\"verb\":\"shorterorequal\",\"field\":\"oname\",\"value\":\"2\"},{\"verb\":\"shorterorequal\",\"field\":\"osurname\",\"value\":\"2\"}]},{\"criteria\":[{\"verb\":\"shorterorequal\", \"field\":\"name\", \"value\":\"2\"},{\"verb\":\"shorterorequal\", \"field\":\"surname\", \"value\":\"2\"}]}, {\"criteria\":[{\"verb\":\"equals\", \"field\":\"oname\", \"value\":\"Given Names Deactivated\"},{\"verb\":\"equals\", \"field\":\"osurname\", \"value\":\"Family Name Deactivated\"}]}]}";
// needed for fuzzywuzzy to get a lower bound ratio under which the authors are most probably different
String[][] wrong = {
@ -149,6 +149,12 @@ public class CleanOrcidTest {
},
{
"20", MakeReportSparkJob.handleNameSurname("Adelle Craig")
},
{
"21", MakeReportSparkJob.handleNameSurname("Ramziddin M")
},
{
"20", MakeReportSparkJob.handleNameSurname("R. Mansurov")
}
};
@ -181,6 +187,118 @@ public class CleanOrcidTest {
spark.stop();
}
@Test
public void loadBlackList() {
	// Parse the JSON whitelist and bind each criteria group to its runtime
	// selection logic through the constraint resolver.
	WhiteList loadedWhiteList = new Gson().fromJson(whitelist, WhiteList.class);
	ConstraintResolver resolver = ConstraintResolverFactory.newInstance();
	loadedWhiteList.getWhitelist().forEach(c -> {
		try {
			c.setSelection(resolver);
		} catch (InvocationTargetException | NoSuchMethodException | InstantiationException
			| IllegalAccessException e) {
			// A reflection failure means the whitelist configuration is broken:
			// fail fast instead of swallowing the error and asserting against a
			// half-initialised whitelist.
			throw new IllegalStateException("unable to bind whitelist selection criteria", e);
		}
	});

	Map<String, String> param = new HashMap<>();
	param.put("oname", "Miriam");
	param.put("name", "Miriam");
	param.put("osurname", "Miriam");
	param.put("surname", "Miriam");

	// Long name/surname values match none of the whitelist criteria groups.
	loadedWhiteList.getWhitelist().forEach(c -> Assertions.assertFalse(c.verifyCriteria(param)));

	// Short ORCID name and surname (<= 2 chars) match the first criteria group only.
	param.put("oname", "P");
	param.put("osurname", "tj");
	Assertions.assertEquals(1, countMatching(loadedWhiteList, param));

	// The "deactivated" sentinel values match the dedicated criteria group only.
	param.put("oname", "Given Names Deactivated");
	param.put("osurname", "Family Name Deactivated");
	Assertions.assertEquals(1, countMatching(loadedWhiteList, param));

	// Short result name and surname additionally match the second criteria group,
	// so together with the sentinel group two groups now match.
	param.put("name", "P");
	param.put("surname", "tj");
	Assertions.assertEquals(2, countMatching(loadedWhiteList, param));

	// Re-setting the sentinel values leaves the same two groups matching.
	param.put("oname", "Given Names Deactivated");
	param.put("osurname", "Family Name Deactivated");
	Assertions.assertEquals(2, countMatching(loadedWhiteList, param));
}

/**
 * Counts how many whitelist criteria groups are satisfied by the given author attributes.
 *
 * @param whiteList the whitelist whose criteria groups are evaluated
 * @param param     author attributes keyed by field name (oname, osurname, name, surname)
 * @return the number of criteria groups matching the attributes
 */
private long countMatching(WhiteList whiteList, Map<String, String> param) {
	return whiteList
		.getWhitelist()
		.stream()
		.filter(c -> c.verifyCriteria(param))
		.count();
}
@Test
public void loadOrcid() {
@ -201,11 +319,30 @@ public class CleanOrcidTest {
}
@Test
public void serializeConstraint() throws JsonProcessingException {
	// Build a single constraint, reuse it twice inside one criteria group,
	// and print the JSON rendering of the resulting whitelist.
	Constraints constraint = new Constraints();
	constraint.setField("field");
	constraint.setVerb("verb");
	constraint.setValue("value");

	SelectionConstraints criteriaGroup = new SelectionConstraints();
	criteriaGroup.setCriteria(Arrays.asList(constraint, constraint));

	WhiteList whiteList = new WhiteList();
	whiteList.setWhitelist(Arrays.asList(criteriaGroup));

	System.out.println(OBJECT_MAPPER.writeValueAsString(whiteList));
}
@Test
public void makeReportTest() throws Exception {
final String inputPath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/part-00000.gz")
.getPath();
final String inputPath = "";
// getClass()
// .getResource("/eu/dnetlib/dhp/oa/graph/clean/part-00000.gz")
// .getPath();
final String preparedInfoPath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/part-00000-prepared.json.gz")
@ -221,7 +358,8 @@ public class CleanOrcidTest {
"-inputPath", inputPath,
"-preparedInfoPath", preparedInfoPath,
"-orcidInputPath", orcidInputPath,
"-graphTableClassName", "eu.dnetlib.dhp.schema.oaf.Publication"
"-graphTableClassName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-whitelist", whitelist
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -311,7 +449,7 @@ public class CleanOrcidTest {
}
@Test
public void testMakeReport() {
public void testMakeReport() throws IOException {
ResultInfo ri = new ResultInfo();
ri.setName("Prasanth");
ri.setSurname("Manohar");
@ -321,9 +459,39 @@ public class CleanOrcidTest {
oa.setSurname("Ramesh");
Tuple2<ResultInfo, OrcidAuthotitative> t2 = new Tuple2<ResultInfo, OrcidAuthotitative>(ri, oa);
Tuple2<String, ReportInfo> tmp = MakeReportSparkJob.getStringReportInfoFuzzyTuple2(t2);
Tuple2<String, ReportInfo> tmp = MakeReportSparkJob.getStringReportInfoTuple2(t2);
System.out.println(new Gson().toJson(tmp._2(), ReportInfo.class));
ri.setName("Sophia");
ri.setSurname("Hooper");
oa.setName("Man");
oa.setSurname("Yang");
oa.setCreditName("Man Yang (previous known as Sophia Yang Hooper)");
WhiteList wl = new Gson().fromJson(whitelist, WhiteList.class);
ConstraintResolver resolver = ConstraintResolverFactory.newInstance();
wl.getWhitelist().forEach(c -> {
try {
c.setSelection(resolver);
} catch (InvocationTargetException e) {
e.printStackTrace();
} catch (NoSuchMethodException e) {
e.printStackTrace();
} catch (InstantiationException e) {
e.printStackTrace();
} catch (IllegalAccessException e) {
e.printStackTrace();
}
});
System.out
.println(
OBJECT_MAPPER
.writeValueAsString(
MakeReportSparkJob.getStringReportInfoFuzzyTuple2(new Tuple2<>(ri, oa), wl)._2()));
}
@Test