graph cleaning, suggestions from ticket 8898 - round 2 #356

Merged
claudio.atzori merged 3 commits from cleaning_8898 into beta 2023-11-22 14:00:38 +01:00
3 changed files with 103 additions and 9 deletions
Showing only changes of commit 262d7c581b - Show all commits

View File

@ -3,16 +3,20 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.LocalDate; import java.time.LocalDate;
import java.time.ZoneId; import java.time.ZoneId;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException; import java.time.format.DateTimeParseException;
import java.util.*; import java.util.*;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.cxf.common.util.UrlUtils;
import com.github.sisyphsu.dateparser.DateParserUtils; import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
@ -23,6 +27,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import me.xuender.unidecode.Unidecode; import me.xuender.unidecode.Unidecode;
import sun.awt.HKSCS;
public class GraphCleaningFunctions extends CleaningFunctions { public class GraphCleaningFunctions extends CleaningFunctions {
@ -36,6 +41,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
private static final String NAME_CLEANING_REGEX = "[\\r\\n\\t\\s]+"; private static final String NAME_CLEANING_REGEX = "[\\r\\n\\t\\s]+";
private static final Set<String> INVALID_AUTHOR_NAMES = new HashSet<>();
private static final Set<String> INVALID_URLS = new HashSet<>();
private static final Set<String> INVALID_URL_HOSTS = new HashSet<>();
private static final HashSet<String> PEER_REVIEWED_TYPES = new HashSet<>(); private static final HashSet<String> PEER_REVIEWED_TYPES = new HashSet<>();
static { static {
@ -48,6 +60,47 @@ public class GraphCleaningFunctions extends CleaningFunctions {
PEER_REVIEWED_TYPES.add("Thesis"); PEER_REVIEWED_TYPES.add("Thesis");
PEER_REVIEWED_TYPES.add("Bachelor thesis"); PEER_REVIEWED_TYPES.add("Bachelor thesis");
PEER_REVIEWED_TYPES.add("Conference object"); PEER_REVIEWED_TYPES.add("Conference object");
INVALID_AUTHOR_NAMES.add("(:null)");
INVALID_AUTHOR_NAMES.add("(:unap)");
INVALID_AUTHOR_NAMES.add("(:tba)");
INVALID_AUTHOR_NAMES.add("(:unas)");
INVALID_AUTHOR_NAMES.add("(:unav)");
INVALID_AUTHOR_NAMES.add("(:unkn)");
INVALID_AUTHOR_NAMES.add("(:unkn) unknown");
INVALID_AUTHOR_NAMES.add(":none");
INVALID_AUTHOR_NAMES.add(":null");
INVALID_AUTHOR_NAMES.add(":unas");
INVALID_AUTHOR_NAMES.add(":unav");
INVALID_AUTHOR_NAMES.add(":unkn");
INVALID_AUTHOR_NAMES.add("[autor desconocido]");
INVALID_AUTHOR_NAMES.add("[s. n.]");
INVALID_AUTHOR_NAMES.add("[s.n]");
INVALID_AUTHOR_NAMES.add("[unknown]");
INVALID_AUTHOR_NAMES.add("anonymous");
INVALID_AUTHOR_NAMES.add("n.n.");
INVALID_AUTHOR_NAMES.add("nn");
INVALID_AUTHOR_NAMES.add("no name supplied");
INVALID_AUTHOR_NAMES.add("none");
INVALID_AUTHOR_NAMES.add("none available");
INVALID_AUTHOR_NAMES.add("not available not available");
INVALID_AUTHOR_NAMES.add("null &na;");
INVALID_AUTHOR_NAMES.add("null anonymous");
INVALID_AUTHOR_NAMES.add("unbekannt");
INVALID_AUTHOR_NAMES.add("unknown");
INVALID_URL_HOSTS.add("creativecommons.org");
INVALID_URL_HOSTS.add("www.academia.edu");
INVALID_URL_HOSTS.add("academia.edu");
INVALID_URL_HOSTS.add("researchgate.net");
INVALID_URL_HOSTS.add("www.researchgate.net");
INVALID_URLS.add("http://repo.scoap3.org/api");
INVALID_URLS.add("http://ora.ox.ac.uk/objects/uuid:");
INVALID_URLS.add("http://ntur.lib.ntu.edu.tw/news/agent_contract.pdf");
INVALID_URLS.add("https://media.springer.com/full/springer-instructions-for-authors-assets/pdf/SN_BPF_EN.pdf");
INVALID_URLS.add("http://www.tobaccoinduceddiseases.org/dl/61aad426c96519bea4040a374c6a6110/");
INVALID_URLS.add("https://www.bilboard.nl/verenigingsbladen/bestuurskundige-berichten");
} }
public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) { public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) {
@ -558,6 +611,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) {
i.setFulltext(null); i.setFulltext(null);
} }
if (Objects.nonNull(i.getUrl())) {
i
.setUrl(
i
.getUrl()
.stream()
.filter(GraphCleaningFunctions::urlFilter)
.collect(Collectors.toList()));
}
} }
} }
if (Objects.isNull(r.getBestaccessright()) if (Objects.isNull(r.getBestaccessright())
@ -580,8 +642,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.getAuthor() .getAuthor()
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(a -> StringUtils.isNotBlank(a.getFullname())) .filter(GraphCleaningFunctions::isValidAuthorName)
.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
.map(GraphCleaningFunctions::cleanupAuthor) .map(GraphCleaningFunctions::cleanupAuthor)
.collect(Collectors.toList())); .collect(Collectors.toList()));
@ -739,14 +800,32 @@ public class GraphCleaningFunctions extends CleaningFunctions {
// HELPERS // HELPERS
private static boolean isValidAuthorName(Author a) { private static boolean isValidAuthorName(Author a) {
return !Stream return StringUtils.isNotBlank(a.getFullname()) &&
StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")) &&
!INVALID_AUTHOR_NAMES.contains(StringUtils.lowerCase(a.getFullname()).trim()) &&
!Stream
.of(a.getFullname(), a.getName(), a.getSurname()) .of(a.getFullname(), a.getName(), a.getSurname())
.filter(s -> s != null && !s.isEmpty()) .filter(StringUtils::isNotBlank)
.collect(Collectors.joining("")) .collect(Collectors.joining(""))
.toLowerCase() .toLowerCase()
.matches(INVALID_AUTHOR_REGEX); .matches(INVALID_AUTHOR_REGEX);
} }
private static boolean urlFilter(String u) {
try {
final URL url = new URL(u);
if (StringUtils.isBlank(url.getPath()) || "/".equals(url.getPath())) {
return false;
}
if (INVALID_URL_HOSTS.contains(url.getHost())) {
return false;
}
return !INVALID_URLS.contains(url.toString());
} catch (MalformedURLException ex) {
return false;
}
}
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) { private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
return pids return pids
.stream() .stream()

View File

@ -251,9 +251,19 @@ public class CleanGraphSparkJobTest {
.filter(String.format("id = '%s'", id)) .filter(String.format("id = '%s'", id))
.first(); .first();
final Set<String> invalidURLs = new HashSet<>();
invalidURLs.add("http://academia.edu/abcd");
invalidURLs.add("http://repo.scoap3.org/api");
invalidURLs.add("http://hdl.handle.net/");
assertNull(p_in.getBestaccessright()); assertNull(p_in.getBestaccessright());
assertTrue(p_in instanceof Result); assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication); assertTrue(p_in instanceof Publication);
assertNotNull(p_in.getAuthor());
assertEquals(14, p_in.getAuthor().size());
assertNotNull(p_in.getInstance());
assertNotNull(p_in.getInstance().get(0));
assertEquals(3, p_in.getInstance().get(0).getUrl().stream().filter(invalidURLs::contains).count());
new CleanGraphSparkJob( new CleanGraphSparkJob(
args( args(
@ -273,6 +283,9 @@ public class CleanGraphSparkJobTest {
assertNull(p.getPublisher()); assertNull(p.getPublisher());
assertNotNull(p.getAuthor());
assertEquals(12, p.getAuthor().size());
assertEquals("und", p.getLanguage().getClassid()); assertEquals("und", p.getLanguage().getClassid());
assertEquals("Undetermined", p.getLanguage().getClassname()); assertEquals("Undetermined", p.getLanguage().getClassname());
@ -364,6 +377,8 @@ public class CleanGraphSparkJobTest {
.stream() .stream()
.anyMatch(s -> s.getValue().equals("10.1009/qwerty"))); .anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
assertTrue(p.getInstance().get(0).getUrl().stream().noneMatch(invalidURLs::contains));
assertNotNull(p.getSubject()); assertNotNull(p.getSubject());
List<Subject> fos_subjects = p List<Subject> fos_subjects = p