graph cleaning, suggestions from ticket 8898 #325

Merged
miriam.baglioni merged 2 commits from cleaning_8898 into beta 2023-08-08 11:14:20 +02:00
4 changed files with 110 additions and 8 deletions

View File

@ -13,11 +13,7 @@ import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.sisyphsu.dateparser.DateParserUtils; import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
@ -39,6 +35,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST); public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
/**
 * Regular expression matching one or more consecutive whitespace characters.
 * In {@link java.util.regex.Pattern} syntax the predefined class {@code \s} already includes
 * carriage return, line feed and tab, so the explicit {@code \r}, {@code \n} and {@code \t}
 * entries are redundant but harmless.
 */
private static final String NAME_CLEANING_REGEX = "[\\r\\n\\t\\s]+";
public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) { public static <T extends Oaf> T cleanContext(T value, String contextId, String verifyParam) {
if (ModelSupport.isSubClass(value, Result.class)) { if (ModelSupport.isSubClass(value, Result.class)) {
@ -247,7 +244,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to evaluate here // nothing to evaluate here
} else if (value instanceof Project) { } else if (value instanceof Project) {
// nothing to evaluate here final Project p = (Project) value;
return Objects.nonNull(p.getCode()) && StringUtils.isNotBlank(p.getCode().getValue());
} else if (value instanceof Organization) { } else if (value instanceof Organization) {
// nothing to evaluate here // nothing to evaluate here
} else if (value instanceof Relation) { } else if (value instanceof Relation) {
@ -294,6 +292,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
} else if (value instanceof Result) { } else if (value instanceof Result) {
Result r = (Result) value; Result r = (Result) value;
if (Objects.nonNull(r.getFulltext()) && (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) ||
ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) {
r.setFulltext(null);
}
if (Objects.nonNull(r.getDateofacceptance())) { if (Objects.nonNull(r.getDateofacceptance())) {
Optional<String> date = cleanDateField(r.getDateofacceptance()); Optional<String> date = cleanDateField(r.getDateofacceptance());
if (date.isPresent()) { if (date.isPresent()) {
@ -318,8 +322,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { if (Objects.nonNull(r.getPublisher())) {
if (StringUtils.isBlank(r.getPublisher().getValue())) {
r.setPublisher(null); r.setPublisher(null);
} else {
r.getPublisher().setValue(
r.getPublisher().getValue()
.replaceAll(NAME_CLEANING_REGEX, " ")
);
}
} }
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
r r
@ -486,6 +497,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
i.setDateofacceptance(null); i.setDateofacceptance(null);
} }
} }
if (StringUtils.isNotBlank(i.getFulltext()) &&
(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) ||
ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) {
i.setFulltext(null);
}
} }
} }
if (Objects.isNull(r.getBestaccessright()) if (Objects.isNull(r.getBestaccessright())
@ -510,6 +526,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(a -> StringUtils.isNotBlank(a.getFullname())) .filter(a -> StringUtils.isNotBlank(a.getFullname()))
.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
.map(GraphCleaningFunctions::cleanupAuthor)
.collect(Collectors.toList())); .collect(Collectors.toList()));
boolean nullRank = r boolean nullRank = r
@ -604,6 +621,32 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return value; return value;
} }
/**
 * Normalises the fullname, name and surname of the given author in place.
 * Each field is rewritten only when it is not blank; blank or null fields are left untouched.
 *
 * @param author the author to clean; it is mutated and must not be null
 * @return the same {@code author} instance, so the method can be used in a stream {@code map} step
 */
private static Author cleanupAuthor(Author author) {
	if (StringUtils.isNotBlank(author.getFullname())) {
		author.setFullname(cleanAuthorNamePart(author.getFullname()));
	}
	if (StringUtils.isNotBlank(author.getName())) {
		author.setName(cleanAuthorNamePart(author.getName()));
	}
	if (StringUtils.isNotBlank(author.getSurname())) {
		author.setSurname(cleanAuthorNamePart(author.getSurname()));
	}
	return author;
}

/**
 * Applies the normalisation shared by all author name fields: every run of whitespace is
 * collapsed to a single space, then every double quote is prefixed with a backslash.
 *
 * @param namePart a non-null name string
 * @return the normalised string
 */
private static String cleanAuthorNamePart(String namePart) {
	// NOTE(review): a leading or trailing whitespace run becomes a single space, it is not trimmed.
	// NOTE(review): the quote escaping is not idempotent; cleaning an already cleaned value adds
	// another backslash in front of each quote. Confirm records are cleaned only once.
	return namePart
		.replaceAll(NAME_CLEANING_REGEX, " ")
		.replace("\"", "\\\"");
}
private static Optional<String> cleanDateField(Field<String> dateofacceptance) { private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
return Optional return Optional
.ofNullable(dateofacceptance) .ofNullable(dateofacceptance)

View File

@ -251,6 +251,12 @@ public class GraphCleaningFunctionsTest {
pid.getQualifier().getClassname())); pid.getQualifier().getClassname()));
}); });
assertTrue(
p_cleaned
.getAuthor()
.stream()
.anyMatch(a -> "Brien, Tom".equals(a.getFullname())));
assertNotNull(p_cleaned.getSubject()); assertNotNull(p_cleaned.getSubject());
List<Subject> fos_subjects = p_cleaned List<Subject> fos_subjects = p_cleaned
@ -285,6 +291,31 @@ public class GraphCleaningFunctionsTest {
System.out.println(MAPPER.writeValueAsString(p_cleaned)); System.out.println(MAPPER.writeValueAsString(p_cleaned));
} }
/**
 * Loads the dataset fixture, applies vocabulary name fixing and the cleaning mapping,
 * runs the graph cleanup and checks the resulting publisher value.
 */
@Test
void testCleaning_dataset() throws Exception {

	assertNotNull(vocabularies);
	assertNotNull(mapping);

	// Decode the fixture explicitly as UTF-8: the single-argument IOUtils.toString(InputStream)
	// falls back to the platform default charset, which makes the test environment-dependent.
	String json = IOUtils
		.toString(
			getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_dataset.json"),
			java.nio.charset.StandardCharsets.UTF_8);
	Dataset p_in = MAPPER.readValue(json, Dataset.class);

	assertTrue(p_in instanceof Result);
	assertTrue(p_in instanceof Dataset);

	Dataset p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);

	assertNotNull(p_out);

	// The publisher must survive the mapping step before the cleanup is exercised.
	assertNotNull(p_out.getPublisher());
	assertNotNull(p_out.getPublisher().getValue());

	Dataset p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);

	assertEquals("Best publisher in the world", p_cleaned.getPublisher().getValue());

	// NOTE(review): the fixture also carries result-level and instance-level fulltext values,
	// but nothing here asserts on them after cleanup; consider adding such checks once the
	// expected outcome for dataset results is confirmed.
}
private static void verify_keyword(Publication p_cleaned, String subject) { private static void verify_keyword(Publication p_cleaned, String subject) {
Optional<Subject> s1 = p_cleaned Optional<Subject> s1 = p_cleaned
.getSubject() .getSubject()

View File

@ -0,0 +1,28 @@
{
"resulttype": {
"classid": "dataset",
"classname": "dataset",
"schemeid": "dnet:result_typologies",
"schemename": "dnet:result_typologies"
},
"fulltext": [
{
"value" : "https://www.researchgate.net"
}
],
"publisher" : {
"value" : "Best publisher in the world"
},
"id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375",
"instance": [
{
"instancetype": {
"classid": "Comment/debate",
"classname": "Comment/debate",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
},
"fulltext": "https://www.researchgate.net"
}
]
}