graph cleaning, suggestions from ticket 8898 - round 2 #356
@@ -3,16 +3,20 @@ package eu.dnetlib.dhp.schema.oaf.utils;
 
 import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
 
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.time.LocalDate;
 import java.time.ZoneId;
 import java.time.format.DateTimeFormatter;
 import java.time.format.DateTimeParseException;
 import java.util.*;
 import java.util.function.Function;
+import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import org.apache.commons.lang3.StringUtils;
+import org.apache.cxf.common.util.UrlUtils;
 
 import com.github.sisyphsu.dateparser.DateParserUtils;
 import com.google.common.collect.Lists;
@@ -23,6 +27,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import me.xuender.unidecode.Unidecode;
+import sun.awt.HKSCS;
 
 public class GraphCleaningFunctions extends CleaningFunctions {
 
@@ -36,6 +41,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 
 	public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
 	private static final String NAME_CLEANING_REGEX = "[\\r\\n\\t\\s]+";
+
+	private static final Set<String> INVALID_AUTHOR_NAMES = new HashSet<>();
+
+	private static final Set<String> INVALID_URLS = new HashSet<>();
+
+	private static final Set<String> INVALID_URL_HOSTS = new HashSet<>();
+
 	private static final HashSet<String> PEER_REVIEWED_TYPES = new HashSet<>();
 
 	static {
@@ -48,6 +60,47 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 		PEER_REVIEWED_TYPES.add("Thesis");
 		PEER_REVIEWED_TYPES.add("Bachelor thesis");
 		PEER_REVIEWED_TYPES.add("Conference object");
+
+		INVALID_AUTHOR_NAMES.add("(:null)");
+		INVALID_AUTHOR_NAMES.add("(:unap)");
+		INVALID_AUTHOR_NAMES.add("(:tba)");
+		INVALID_AUTHOR_NAMES.add("(:unas)");
+		INVALID_AUTHOR_NAMES.add("(:unav)");
+		INVALID_AUTHOR_NAMES.add("(:unkn)");
+		INVALID_AUTHOR_NAMES.add("(:unkn) unknown");
+		INVALID_AUTHOR_NAMES.add(":none");
+		INVALID_AUTHOR_NAMES.add(":null");
+		INVALID_AUTHOR_NAMES.add(":unas");
+		INVALID_AUTHOR_NAMES.add(":unav");
+		INVALID_AUTHOR_NAMES.add(":unkn");
+		INVALID_AUTHOR_NAMES.add("[autor desconocido]");
+		INVALID_AUTHOR_NAMES.add("[s. n.]");
+		INVALID_AUTHOR_NAMES.add("[s.n]");
+		INVALID_AUTHOR_NAMES.add("[unknown]");
+		INVALID_AUTHOR_NAMES.add("anonymous");
+		INVALID_AUTHOR_NAMES.add("n.n.");
+		INVALID_AUTHOR_NAMES.add("nn");
+		INVALID_AUTHOR_NAMES.add("no name supplied");
+		INVALID_AUTHOR_NAMES.add("none");
+		INVALID_AUTHOR_NAMES.add("none available");
+		INVALID_AUTHOR_NAMES.add("not available not available");
+		INVALID_AUTHOR_NAMES.add("null &na;");
+		INVALID_AUTHOR_NAMES.add("null anonymous");
+		INVALID_AUTHOR_NAMES.add("unbekannt");
+		INVALID_AUTHOR_NAMES.add("unknown");
+
+		INVALID_URL_HOSTS.add("creativecommons.org");
+		INVALID_URL_HOSTS.add("www.academia.edu");
+		INVALID_URL_HOSTS.add("academia.edu");
+		INVALID_URL_HOSTS.add("researchgate.net");
+		INVALID_URL_HOSTS.add("www.researchgate.net");
+
+		INVALID_URLS.add("http://repo.scoap3.org/api");
+		INVALID_URLS.add("http://ora.ox.ac.uk/objects/uuid:");
+		INVALID_URLS.add("http://ntur.lib.ntu.edu.tw/news/agent_contract.pdf");
+		INVALID_URLS.add("https://media.springer.com/full/springer-instructions-for-authors-assets/pdf/SN_BPF_EN.pdf");
+		INVALID_URLS.add("http://www.tobaccoinduceddiseases.org/dl/61aad426c96519bea4040a374c6a6110/");
+		INVALID_URLS.add("https://www.bilboard.nl/verenigingsbladen/bestuurskundige-berichten");
 	}
 
 	public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) {
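Note (not part of the diff): the INVALID_URL_HOSTS entries are compared verbatim against the host part of each URL (see the urlFilter helper further down), which is why the bare and the www. variants of academia.edu and researchgate.net each need their own entry. A minimal standalone sketch of that exact-host matching; the class name and sample URLs are illustrative only.

import java.net.URL;
import java.util.HashSet;
import java.util.Set;

// Illustrative only: exact-host matching means "academia.edu" and
// "www.academia.edu" are distinct blocklist entries, as above.
public class HostBlocklistSketch {

	public static void main(String[] args) throws Exception {
		final Set<String> invalidHosts = new HashSet<>();
		invalidHosts.add("academia.edu");
		invalidHosts.add("www.academia.edu");

		// URL#getHost returns the authority verbatim; no "www." normalisation happens
		System.out.println(invalidHosts.contains(new URL("http://academia.edu/abcd").getHost())); // true
		System.out.println(invalidHosts.contains(new URL("https://www.academia.edu/s/x").getHost())); // true
		System.out.println(invalidHosts.contains(new URL("https://dl.academia.edu/x").getHost())); // false: subdomain not listed
	}
}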
@@ -558,6 +611,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 					ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) {
 					i.setFulltext(null);
 				}
+				if (Objects.nonNull(i.getUrl())) {
+					i
+						.setUrl(
+							i
+								.getUrl()
+								.stream()
+								.filter(GraphCleaningFunctions::urlFilter)
+								.collect(Collectors.toList()));
+				}
 			}
 		}
 		if (Objects.isNull(r.getBestaccessright())
@@ -580,8 +642,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 					.getAuthor()
 					.stream()
 					.filter(Objects::nonNull)
-					.filter(a -> StringUtils.isNotBlank(a.getFullname()))
-					.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
+					.filter(GraphCleaningFunctions::isValidAuthorName)
 					.map(GraphCleaningFunctions::cleanupAuthor)
 					.collect(Collectors.toList()));
 
@@ -739,14 +800,32 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 	// HELPERS
 
 	private static boolean isValidAuthorName(Author a) {
-		return !Stream
+		return StringUtils.isNotBlank(a.getFullname()) &&
+			StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")) &&
+			!INVALID_AUTHOR_NAMES.contains(StringUtils.lowerCase(a.getFullname()).trim()) &&
+			!Stream
 			.of(a.getFullname(), a.getName(), a.getSurname())
-			.filter(s -> s != null && !s.isEmpty())
+			.filter(StringUtils::isNotBlank)
 			.collect(Collectors.joining(""))
 			.toLowerCase()
 			.matches(INVALID_AUTHOR_REGEX);
 	}
 
+	private static boolean urlFilter(String u) {
+		try {
+			final URL url = new URL(u);
+			if (StringUtils.isBlank(url.getPath()) || "/".equals(url.getPath())) {
+				return false;
+			}
+			if (INVALID_URL_HOSTS.contains(url.getHost())) {
+				return false;
+			}
+			return !INVALID_URLS.contains(url.toString());
+		} catch (MalformedURLException ex) {
+			return false;
+		}
+	}
+
 	private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
 		return pids
 			.stream()
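A minimal standalone sketch of how the new urlFilter rules play out, restated outside the class because the helper is private; the class name, the trimmed-down blocklists and the sample URLs are illustrative only. Each of the three URLs the test below adds to invalidURLs is rejected by a different rule: blocklisted host, blocklisted exact URL, and a path of just "/".

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;

// Standalone restatement of the urlFilter rules, for illustration only.
public class UrlFilterSketch {

	private static final Set<String> INVALID_URL_HOSTS = new HashSet<>(
		Arrays.asList("academia.edu", "www.academia.edu", "researchgate.net", "www.researchgate.net"));
	private static final Set<String> INVALID_URLS = new HashSet<>(
		Arrays.asList("http://repo.scoap3.org/api"));

	static boolean urlFilter(String u) {
		try {
			final URL url = new URL(u);
			if (StringUtils.isBlank(url.getPath()) || "/".equals(url.getPath())) {
				return false; // no meaningful path, e.g. a bare landing page
			}
			if (INVALID_URL_HOSTS.contains(url.getHost())) {
				return false; // blocklisted host
			}
			return !INVALID_URLS.contains(url.toString()); // blocklisted exact URL
		} catch (MalformedURLException ex) {
			return false; // unparsable URLs are dropped as well
		}
	}

	public static void main(String[] args) {
		System.out.println(urlFilter("http://academia.edu/abcd")); // false: host is blocklisted
		System.out.println(urlFilter("http://repo.scoap3.org/api")); // false: exact URL is blocklisted
		System.out.println(urlFilter("http://hdl.handle.net/")); // false: path is just "/"
		System.out.println(urlFilter("https://doi.org/10.1000/xyz")); // true: kept
	}
}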
@@ -251,9 +251,19 @@ public class CleanGraphSparkJobTest {
 			.filter(String.format("id = '%s'", id))
 			.first();
 
+		final Set<String> invalidURLs = new HashSet<>();
+		invalidURLs.add("http://academia.edu/abcd");
+		invalidURLs.add("http://repo.scoap3.org/api");
+		invalidURLs.add("http://hdl.handle.net/");
+
 		assertNull(p_in.getBestaccessright());
 		assertTrue(p_in instanceof Result);
 		assertTrue(p_in instanceof Publication);
+		assertNotNull(p_in.getAuthor());
+		assertEquals(14, p_in.getAuthor().size());
+		assertNotNull(p_in.getInstance());
+		assertNotNull(p_in.getInstance().get(0));
+		assertEquals(3, p_in.getInstance().get(0).getUrl().stream().filter(invalidURLs::contains).count());
 
 		new CleanGraphSparkJob(
 			args(
@@ -273,6 +283,9 @@ public class CleanGraphSparkJobTest {
 
 		assertNull(p.getPublisher());
 
+		assertNotNull(p.getAuthor());
+		assertEquals(12, p.getAuthor().size());
+
 		assertEquals("und", p.getLanguage().getClassid());
 		assertEquals("Undetermined", p.getLanguage().getClassname());
 
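The drop from 14 authors in the input record (asserted above) to 12 after cleaning suggests that two placeholder author names are removed by the new validation. A minimal sketch of the blocklist part of that check, assuming the same lower-case-and-trim lookup as isValidAuthorName; the class name, sample names and the trimmed-down blocklist are illustrative, and the real helper additionally rejects names matching INVALID_AUTHOR_REGEX.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;

// Illustration only: the fullname is lower-cased and trimmed before the lookup,
// so "Anonymous" and "(:Unkn) Unknown" are both rejected. Sample names are made up.
public class AuthorNameSketch {

	private static final Set<String> INVALID_AUTHOR_NAMES = new HashSet<>(
		Arrays.asList("(:unkn) unknown", "anonymous", "unknown", "[s.n]"));

	static boolean hasValidFullname(String fullname) {
		return StringUtils.isNotBlank(fullname)
			&& StringUtils.isNotBlank(fullname.replaceAll("[\\W]", ""))
			&& !INVALID_AUTHOR_NAMES.contains(StringUtils.lowerCase(fullname).trim());
	}

	public static void main(String[] args) {
		System.out.println(hasValidFullname("Doe, Jane")); // true
		System.out.println(hasValidFullname("(:Unkn) Unknown")); // false: blocklisted placeholder
		System.out.println(hasValidFullname("Anonymous")); // false: blocklisted placeholder
		System.out.println(hasValidFullname("....")); // false: no word characters left
	}
}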
@@ -364,6 +377,8 @@ public class CleanGraphSparkJobTest {
 			.stream()
 			.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
 
+		assertTrue(p.getInstance().get(0).getUrl().stream().noneMatch(invalidURLs::contains));
+
 		assertNotNull(p.getSubject());
 
 		List<Subject> fos_subjects = p