forked from antonis.lempesis/dnet-hadoop
introduced PidBlacklist
This commit is contained in:
parent
893ac4a77b
commit
943b961cf6
|
@ -1,13 +1,12 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
||||
|
@ -48,7 +47,8 @@ public class IdentifierFactory implements Serializable {
|
|||
Map<String, List<StructuredProperty>> pids = entity
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(s -> pidFilter(s))
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.filter(IdentifierFactory::pidFilter)
|
||||
.collect(
|
||||
Collectors
|
||||
.groupingBy(
|
||||
|
@ -83,17 +83,21 @@ public class IdentifierFactory implements Serializable {
|
|||
}
|
||||
|
||||
protected static boolean pidFilter(StructuredProperty s) {
|
||||
final String pidValue = s.getValue();
|
||||
if (Objects.isNull(s.getQualifier()) ||
|
||||
StringUtils.isBlank(s.getValue()) ||
|
||||
StringUtils.isBlank(s.getValue().replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
||||
StringUtils.isBlank(pidValue) ||
|
||||
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
||||
return false;
|
||||
}
|
||||
if (CleaningFunctions.PID_BLACKLIST.contains(StringUtils.trim(s.getValue().toLowerCase()))) {
|
||||
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
|
||||
return false;
|
||||
}
|
||||
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
|
||||
return false;
|
||||
}
|
||||
switch (PidType.tryValueOf(s.getQualifier().getClassid())) {
|
||||
case doi:
|
||||
final String doi = StringUtils.trim(StringUtils.lowerCase(s.getValue()));
|
||||
final String doi = StringUtils.trim(StringUtils.lowerCase(pidValue));
|
||||
return doi.matches(DOI_REGEX);
|
||||
case original:
|
||||
return false;
|
||||
|
@ -103,13 +107,12 @@ public class IdentifierFactory implements Serializable {
|
|||
}
|
||||
|
||||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
|
||||
final String value = CleaningFunctions.normalizePidValue(s).getValue();
|
||||
return new StringBuilder()
|
||||
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
|
||||
.append(ID_PREFIX_SEPARATOR)
|
||||
.append(createPrefix(s.getQualifier().getClassid()))
|
||||
.append(ID_SEPARATOR)
|
||||
.append(md5 ? DHPUtils.md5(value) : value)
|
||||
.append(md5 ? DHPUtils.md5(s.getValue()) : s.getValue())
|
||||
.toString();
|
||||
}
|
||||
|
||||
|
|
|
@ -1,2 +1,8 @@
|
|||
package eu.dnetlib.dhp.schema.oaf.utils;public class PidBlacklist {
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
public class PidBlacklist extends HashMap<String, HashSet<String>> {
|
||||
}
|
||||
|
|
|
@ -1,2 +1,37 @@
|
|||
package eu.dnetlib.dhp.schema.oaf.utils;public class PidBlacklistProvider {
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
public class PidBlacklistProvider {
|
||||
|
||||
private static final PidBlacklist blacklist;
|
||||
|
||||
static {
|
||||
try {
|
||||
String json = IOUtils.toString(IdentifierFactory.class.getResourceAsStream("pid_blacklist.json"));
|
||||
blacklist = new ObjectMapper().readValue(json, PidBlacklist.class);
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static PidBlacklist getBlacklist() {
|
||||
return blacklist;
|
||||
}
|
||||
|
||||
public static Set<String> getBlacklist(String pidType) {
|
||||
return Optional
|
||||
.ofNullable(getBlacklist().get(pidType))
|
||||
.orElse(new HashSet<>());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,2 +1,17 @@
|
|||
package eu.dnetlib.dhp.schema.oaf.utils;public class BlackListProviderTest {
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class BlackListProviderTest {
|
||||
|
||||
@Test
|
||||
public void blackListTest() {
|
||||
|
||||
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist());
|
||||
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist().get("doi"));
|
||||
Assertions.assertTrue(PidBlacklistProvider.getBlacklist().get("doi").size() > 0);
|
||||
Assertions.assertNull(PidBlacklistProvider.getBlacklist("xxx"));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[ {"qualifier":{"classid":"doi"},"value":"10.12739/10.12739"},{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
Loading…
Reference in New Issue