
moved class CleaningFunctions to the correct package. Remove newlines from titles, descriptions, subjects

Claudio Atzori 2020-11-24 18:34:03 +01:00
parent 33bae02451
commit e1a1bb3ee4
7 changed files with 43 additions and 20 deletions
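For orientation, a minimal sketch of what the relocated entry point does after this commit. It is illustrative only, not part of the diff; the record content is made up, and it assumes the Publication/StructuredProperty beans from dhp-schemas behave as the hunks below suggest.

import java.util.Collections;

import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

public class CleanupSketch {
    public static void main(String[] args) {
        // a title carrying a stray newline, as it may arrive in an aggregated record
        StructuredProperty title = new StructuredProperty();
        title.setValue("Assessing the impact\nof open access");

        Publication p = new Publication();
        p.setTitle(Collections.singletonList(title));

        // cleanup(..) (formerly fixDefaults(..)) now also strips \n and \r
        // from titles, descriptions and subjects
        Publication cleaned = CleaningFunctions.cleanup(p);

        // prints "Assessing the impact of open access"
        System.out.println(cleaned.getTitle().get(0).getValue());
    }
}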

View File

@@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.clean;
package eu.dnetlib.dhp.schema.oaf;
import java.util.*;
import java.util.function.Function;
@@ -10,12 +10,12 @@ import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String NEWLINES = "(?:\\n|\\r)";
public static final Set<String> PID_BLACKLIST = new HashSet<>();
@@ -76,7 +76,7 @@ public class CleaningFunctions {
return value;
}
protected static <T extends Oaf> T fixDefaults(T value) {
public static <T extends Oaf> T cleanup(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
@@ -109,6 +109,29 @@ public class CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::removeNewLines)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getTitle())) {
r
.setTitle(
r
.getTitle()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::removeNewLines)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) {
r
.setDescription(
r
.getDescription()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::removeNewLines)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPid())) {
@@ -205,6 +228,16 @@ public class CleaningFunctions {
return value;
}
protected static StructuredProperty removeNewLines(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
return s;
}
protected static Field<String> removeNewLines(Field<String> s) {
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
return s;
}
// HELPERS
private static void fixVocabName(Qualifier q, String vocabularyName) {
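For illustration (not part of the commit), the replacement applied by the two new removeNewLines overloads above comes down to a single replaceAll; the sample string is made up:

public class NewlineStripSketch {
    // same pattern as the NEWLINES constant introduced above
    private static final String NEWLINES = "(?:\\n|\\r)";

    public static void main(String[] args) {
        String raw = "Line one\r\nLine two\nLine three";
        // each \n or \r is replaced by a blank, so a \r\n pair yields two blanks
        System.out.println(raw.replaceAll(NEWLINES, " "));
        // prints: Line one  Line two Line three
    }
}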

View File

@@ -7,11 +7,10 @@ import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang.StringUtils;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;

View File

@@ -4,12 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import java.util.Optional;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
public class PidValueComparator implements Comparator<StructuredProperty> {

View File

@@ -2,6 +2,7 @@
package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.oaf.CleaningFunctions.*;
import java.util.Optional;
@@ -26,7 +27,6 @@ import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import static eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions.*;
public class CleanGraphSparkJob {
@@ -89,7 +89,7 @@ public class CleanGraphSparkJob {
readTableFromPath(spark, inputPath, clazz)
.map((MapFunction<T, T>) value -> fixVocabularyNames(value), Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> fixDefaults(value), Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> cleanup(value), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")

View File

@@ -16,9 +16,9 @@ import org.dom4j.Node;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {

View File

@@ -12,7 +12,6 @@ import org.dom4j.Document;
import org.dom4j.Node;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;

View File

@@ -19,10 +19,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@@ -89,7 +86,7 @@ public class CleaningFunctionTest {
.map(p -> p.getQualifier())
.allMatch(q -> pidTerms.contains(q.getClassid())));
Publication p_defaults = CleaningFunctions.fixDefaults(p_out);
Publication p_defaults = CleaningFunctions.cleanup(p_out);
assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid());
assertNull(p_out.getPublisher());