forked from D-Net/dnet-hadoop
parent
89b7bc84f2
commit
fed13e083e
|
@ -1,10 +1,5 @@
|
||||||
package eu.dnetlib.pace.tree;
|
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
package eu.dnetlib.pace.tree;
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
|
||||||
import org.joda.time.DateTime;
|
|
||||||
|
|
||||||
import java.time.DateTimeException;
|
import java.time.DateTimeException;
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
|
@ -13,55 +8,60 @@ import java.time.format.DateTimeFormatter;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("dateRange")
|
@ComparatorClass("dateRange")
|
||||||
public class DateRange extends AbstractStringComparator {
|
public class DateRange extends AbstractStringComparator {
|
||||||
|
|
||||||
int YEAR_RANGE;
|
int YEAR_RANGE;
|
||||||
|
|
||||||
public DateRange(Map<String, String> params) {
|
public DateRange(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3"));
|
YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public DateRange(final double weight) {
|
public DateRange(final double weight) {
|
||||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
protected DateRange(final double weight, final AbstractStringDistance ssalgo) {
|
protected DateRange(final double weight, final AbstractStringDistance ssalgo) {
|
||||||
super(weight, ssalgo);
|
super(weight, ssalgo);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean isNumeric(String str) {
|
public static boolean isNumeric(String str) {
|
||||||
return str.matches("\\d+"); //match a number with optional '-' and decimal.
|
return str.matches("\\d+"); // match a number with optional '-' and decimal.
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
if (a.isEmpty() || b.isEmpty()) {
|
if (a.isEmpty() || b.isEmpty()) {
|
||||||
return -1.0; // return -1 if a field is missing
|
return -1.0; // return -1 if a field is missing
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH);
|
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH);
|
||||||
LocalDate d1 = LocalDate.parse(a, formatter);
|
LocalDate d1 = LocalDate.parse(a, formatter);
|
||||||
LocalDate d2 = LocalDate.parse(b, formatter);
|
LocalDate d2 = LocalDate.parse(b, formatter);
|
||||||
Period period = Period.between(d1, d2);
|
Period period = Period.between(d1, d2);
|
||||||
|
|
||||||
return period.getYears() <= YEAR_RANGE? 1.0 : 0.0;
|
return period.getYears() <= YEAR_RANGE ? 1.0 : 0.0;
|
||||||
}
|
} catch (DateTimeException e) {
|
||||||
catch (DateTimeException e) {
|
return -1.0;
|
||||||
return -1.0;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double getWeight() {
|
public double getWeight() {
|
||||||
return super.weight;
|
return super.weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected double normalize(final double d) {
|
protected double normalize(final double d) {
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -62,7 +62,7 @@ public class JsonListMatch extends AbstractListComparator {
|
||||||
|
|
||||||
Set<String> types = Sets.intersection(typesA, typesB);
|
Set<String> types = Sets.intersection(typesA, typesB);
|
||||||
|
|
||||||
if (types.isEmpty()) // if no common type, it is impossible to compare
|
if (types.isEmpty()) // if no common type, it is impossible to compare
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
||||||
|
|
|
@ -72,14 +72,34 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
CodeMatch codeMatch = new CodeMatch(params);
|
CodeMatch codeMatch = new CodeMatch(params);
|
||||||
|
|
||||||
// names have different codes
|
// names have different codes
|
||||||
assertEquals(0.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ir02", conf));
|
assertEquals(
|
||||||
|
0.0,
|
||||||
|
codeMatch
|
||||||
|
.distance(
|
||||||
|
"physical oceanography at ctd station june 1998 ev02a",
|
||||||
|
"physical oceanography at ctd station june 1998 ir02", conf));
|
||||||
|
|
||||||
// names have same code
|
// names have same code
|
||||||
assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ev02a", conf));
|
assertEquals(
|
||||||
|
1.0,
|
||||||
|
codeMatch
|
||||||
|
.distance(
|
||||||
|
"physical oceanography at ctd station june 1998 ev02a",
|
||||||
|
"physical oceanography at ctd station june 1998 ev02a", conf));
|
||||||
|
|
||||||
// code is not in both names
|
// code is not in both names
|
||||||
assertEquals(-1, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998 ev02a", conf));
|
assertEquals(
|
||||||
assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", conf));
|
-1,
|
||||||
|
codeMatch
|
||||||
|
.distance(
|
||||||
|
"physical oceanography at ctd station june 1998",
|
||||||
|
"physical oceanography at ctd station june 1998 ev02a", conf));
|
||||||
|
assertEquals(
|
||||||
|
1.0,
|
||||||
|
codeMatch
|
||||||
|
.distance(
|
||||||
|
"physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998",
|
||||||
|
conf));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -275,7 +295,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
||||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"),
|
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"),
|
||||||
"authors");
|
"authors");
|
||||||
List<String> b = createFieldList(
|
List<String> b = createFieldList(
|
||||||
Arrays
|
Arrays
|
||||||
|
|
|
@ -130,7 +130,7 @@ public class ResultTagger implements Serializable {
|
||||||
// log.info("Remove constraints for " + communityId);
|
// log.info("Remove constraints for " + communityId);
|
||||||
if (conf.getRemoveConstraintsMap().keySet().contains(communityId) &&
|
if (conf.getRemoveConstraintsMap().keySet().contains(communityId) &&
|
||||||
conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
|
conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
|
||||||
!conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() &&
|
!conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() &&
|
||||||
conf
|
conf
|
||||||
.getRemoveConstraintsMap()
|
.getRemoveConstraintsMap()
|
||||||
.get(communityId)
|
.get(communityId)
|
||||||
|
@ -228,7 +228,7 @@ public class ResultTagger implements Serializable {
|
||||||
.forEach(communityId -> {
|
.forEach(communityId -> {
|
||||||
if (!removeCommunities.contains(communityId) &&
|
if (!removeCommunities.contains(communityId) &&
|
||||||
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
||||||
!conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() &&
|
!conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() &&
|
||||||
conf
|
conf
|
||||||
.getSelectionConstraintsMap()
|
.getSelectionConstraintsMap()
|
||||||
.get(communityId)
|
.get(communityId)
|
||||||
|
|
|
@ -915,7 +915,8 @@ class MappersTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testODFRecord_guidelines4() throws IOException {
|
void testODFRecord_guidelines4() throws IOException {
|
||||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml")));
|
final String xml = IOUtils
|
||||||
|
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml")));
|
||||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||||
|
|
||||||
final Publication p = (Publication) list.get(0);
|
final Publication p = (Publication) list.get(0);
|
||||||
|
|
|
@ -5,7 +5,6 @@ import java.io.StringReader;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.solr.PersonTopic;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
|
@ -40,6 +39,7 @@ import eu.dnetlib.dhp.schema.solr.OpenAccessColor;
|
||||||
import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
|
import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
|
||||||
import eu.dnetlib.dhp.schema.solr.Organization;
|
import eu.dnetlib.dhp.schema.solr.Organization;
|
||||||
import eu.dnetlib.dhp.schema.solr.Person;
|
import eu.dnetlib.dhp.schema.solr.Person;
|
||||||
|
import eu.dnetlib.dhp.schema.solr.PersonTopic;
|
||||||
import eu.dnetlib.dhp.schema.solr.Pid;
|
import eu.dnetlib.dhp.schema.solr.Pid;
|
||||||
import eu.dnetlib.dhp.schema.solr.Project;
|
import eu.dnetlib.dhp.schema.solr.Project;
|
||||||
import eu.dnetlib.dhp.schema.solr.Result;
|
import eu.dnetlib.dhp.schema.solr.Result;
|
||||||
|
@ -216,11 +216,14 @@ public class ProvisionModelSupport {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static List<PersonTopic> mapPersonTopics(List<eu.dnetlib.dhp.schema.oaf.PersonTopic> subjects) {
|
private static List<PersonTopic> mapPersonTopics(List<eu.dnetlib.dhp.schema.oaf.PersonTopic> subjects) {
|
||||||
return Optional.ofNullable(subjects)
|
return Optional
|
||||||
.map(ss -> ss.stream()
|
.ofNullable(subjects)
|
||||||
.map(ProvisionModelSupport::mapPersonTopic)
|
.map(
|
||||||
.collect(Collectors.toList()))
|
ss -> ss
|
||||||
.orElse(null);
|
.stream()
|
||||||
|
.map(ProvisionModelSupport::mapPersonTopic)
|
||||||
|
.collect(Collectors.toList()))
|
||||||
|
.orElse(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static PersonTopic mapPersonTopic(eu.dnetlib.dhp.schema.oaf.PersonTopic pt) {
|
private static PersonTopic mapPersonTopic(eu.dnetlib.dhp.schema.oaf.PersonTopic pt) {
|
||||||
|
|
Loading…
Reference in New Issue