implemented DedupRecord factory with the merge of organizations

This commit is contained in:
Sandro La Bruzzo 2019-12-11 16:57:37 +01:00
parent abd9034da0
commit 6b45e37e22
7 changed files with 113 additions and 1809 deletions

View File

@ -164,4 +164,27 @@ public class Organization extends OafEntity implements Serializable {
public void setCountry(Qualifier country) { public void setCountry(Qualifier country) {
this.country = country; this.country = country;
} }
/**
 * Merges the state of another entity into this organization.
 * <p>
 * The call is first delegated to {@code super.mergeFrom(e)}; afterwards {@code e} is cast to
 * {@link Organization} (no type check is performed before the cast). Each single-valued field
 * of this instance is replaced by the incoming value when that value is non-null and is left
 * untouched otherwise. {@code alternativeNames} is combined through {@code mergeLists}, with
 * the incoming list passed as the first argument.
 *
 * @param e the entity whose values are merged into this one
 */
@Override
public void mergeFrom(OafEntity e) {
    super.mergeFrom(e);
    final Organization other = (Organization) e;

    if (other.getLegalshortname() != null) {
        legalshortname = other.getLegalshortname();
    }
    if (other.getLegalname() != null) {
        legalname = other.getLegalname();
    }

    // The only list-valued field handled here: merged rather than overwritten.
    alternativeNames = mergeLists(other.getAlternativeNames(), alternativeNames);

    if (other.getWebsiteurl() != null) {
        websiteurl = other.getWebsiteurl();
    }
    if (other.getLogourl() != null) {
        logourl = other.getLogourl();
    }
    if (other.getEclegalbody() != null) {
        eclegalbody = other.getEclegalbody();
    }
    if (other.getEclegalperson() != null) {
        eclegalperson = other.getEclegalperson();
    }
    if (other.getEcnonprofit() != null) {
        ecnonprofit = other.getEcnonprofit();
    }
    if (other.getEcresearchorganization() != null) {
        ecresearchorganization = other.getEcresearchorganization();
    }
    if (other.getEchighereducation() != null) {
        echighereducation = other.getEchighereducation();
    }
    if (other.getEcinternationalorganizationeurinterests() != null) {
        ecinternationalorganizationeurinterests = other.getEcinternationalorganizationeurinterests();
    }
    if (other.getEcinternationalorganization() != null) {
        ecinternationalorganization = other.getEcinternationalorganization();
    }
    if (other.getEcenterprise() != null) {
        ecenterprise = other.getEcenterprise();
    }
    if (other.getEcsmevalidated() != null) {
        ecsmevalidated = other.getEcsmevalidated();
    }
    if (other.getEcnutscode() != null) {
        ecnutscode = other.getEcnutscode();
    }
    if (other.getCountry() != null) {
        country = other.getCountry();
    }
}
} }

View File

@ -3,10 +3,7 @@ package eu.dnetlib.dhp.schema.oaf;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.*;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
public abstract class Result extends OafEntity implements Serializable { public abstract class Result extends OafEntity implements Serializable {
@ -253,11 +250,6 @@ public abstract class Result extends OafEntity implements Serializable {
Result r = (Result) e; Result r = (Result) e;
//TODO mergeFrom is used only for create Dedup Records since the creation of these two fields requires more complex functions (maybe they will be filled in an external function)
// dateofacceptance = r.getDateofacceptance();
instance = mergeLists(instance, r.getInstance()); instance = mergeLists(instance, r.getInstance());
if (r.getResulttype() != null) if (r.getResulttype() != null)
@ -274,7 +266,7 @@ public abstract class Result extends OafEntity implements Serializable {
relevantdate = mergeLists(relevantdate, r.getRelevantdate()); relevantdate = mergeLists(relevantdate, r.getRelevantdate());
description = mergeLists(description, r.getDescription()); description = longestLists(description, r.getDescription());
if (r.getPublisher() != null) if (r.getPublisher() != null)
publisher = r.getPublisher(); publisher = r.getPublisher();
@ -310,5 +302,16 @@ public abstract class Result extends OafEntity implements Serializable {
} }
/**
 * Chooses one of two field lists, favouring the "longer" one.
 * <ul>
 *   <li>If either list is {@code null}, the other one is returned (which may itself be {@code null}).</li>
 *   <li>If the sizes differ, the list with more elements is returned.</li>
 *   <li>If the sizes are equal, the list containing the longest single non-null value is
 *       returned; on a tie {@code b} is returned.</li>
 * </ul>
 *
 * @param a first candidate list, may be {@code null}
 * @param b second candidate list, may be {@code null}
 * @return either {@code a} or {@code b}; no new list is created
 */
private List<Field<String>> longestLists(List<Field<String>> a, List<Field<String>> b) {
    if (a == null) {
        return b;
    }
    if (b == null) {
        return a;
    }
    if (a.size() != b.size()) {
        return a.size() > b.size() ? a : b;
    }
    // Same element count: decide on the longest individual value; ties go to b.
    final int longestInA = maxValueLength(a);
    final int longestInB = maxValueLength(b);
    return longestInA > longestInB ? a : b;
}

/** Length of the longest non-null value in the list, or 0 when there is none. */
private static int maxValueLength(List<Field<String>> fields) {
    int longest = 0;
    for (Field<String> field : fields) {
        final String value = field.getValue();
        if (value != null && value.length() > longest) {
            longest = value.length();
        }
    }
    return longest;
}
} }

View File

@ -35,7 +35,7 @@ public class StructuredProperty implements Serializable {
} }
// Builds the string used when comparing this property with another one.
// NOTE(review): the lines below come from a side-by-side diff, so the old and new bodies
// appear on the same line. The first return is the previous form ("value::qualifier",
// both lower-cased, empty string for a null part); the second return is the new form,
// which yields only the lower-cased value, or "" when value is null.
public String toComparableString(){ public String toComparableString(){
return String.format("%s::%s", value != null ? value.toLowerCase() : "", qualifier != null ? qualifier.toComparableString().toLowerCase() : ""); return value != null ? value.toLowerCase() : "";
} }
@Override @Override

View File

@ -72,10 +72,6 @@ public class DedupRecordFactory {
.groupByKey(); .groupByKey();
switch (entityType) { switch (entityType) {
case publication: case publication:
return sortedJoinResult.map(DedupRecordFactory::publicationMerger); return sortedJoinResult.map(DedupRecordFactory::publicationMerger);
@ -155,7 +151,33 @@ public class DedupRecordFactory {
// Builds the merged Organization for one group: e._1() is used as the id of the result and
// every JSON string in e._2() is deserialized into an Organization and merged into it via
// mergeFrom. Any exception raised while reading or merging is rethrown as a RuntimeException.
// NOTE(review): this is a side-by-side diff rendering; the signature is repeated and the
// leading "throw new NotImplementedException();" is the previous body, not part of the new one.
private static Organization organizationMerger(Tuple2<String, Iterable<String>> e) { private static Organization organizationMerger(Tuple2<String, Iterable<String>> e) {
throw new NotImplementedException(); Organization o = new Organization(); //the result of the merge, to be returned at the end
o.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
// StringBuilder is used as a mutable holder so the lambda below can update it.
// NOTE(review): trust is written inside the loop but never read afterwards in this method;
// confirm whether it was meant to be stored on the returned organization.
StringBuilder trust = new StringBuilder("0.0");
if (e._2() != null)
e._2().forEach(pub -> {
try {
Organization organization = mapper.readValue(pub, Organization.class);
// NOTE(review): getDataInfo() is dereferenced without a null check; a null here ends up
// in the catch below and is rethrown as a RuntimeException. Confirm it is always set.
final String currentTrust = organization.getDataInfo().getTrust();
// Keeps the most recently seen trust value that is not exactly "1.0".
if (!"1.0".equals(currentTrust)) {
trust.setLength(0);
trust.append(currentTrust);
}
o.mergeFrom(organization);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
return o;
} }
private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e) { private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e) {

View File

@ -16,10 +16,11 @@ import java.util.List;
public class SparkCreateDedupTest { public class SparkCreateDedupTest {
String configuration; String configuration;
String entity = "organization";
// Loads the dedup configuration JSON from the classpath into the configuration field.
// NOTE(review): side-by-side diff rendering; the first statement reads the previous resource
// (pub.curr.conf.json), the second reads the new one (org.curr.conf.json).
@Before
public void setUp() throws IOException { public void setUp() throws IOException {
configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub.curr.conf.json")); configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
} }
@ -29,7 +30,7 @@ public class SparkCreateDedupTest {
SparkCreateSimRels.main(new String[] { SparkCreateSimRels.main(new String[] {
"-mt", "local[*]", "-mt", "local[*]",
"-s", "/home/sandro/betadump", "-s", "/home/sandro/betadump",
"-e", "publication", "-e", entity,
"-c", ArgumentApplicationParser.compressArgument(configuration), "-c", ArgumentApplicationParser.compressArgument(configuration),
"-t", "/tmp/dedup", "-t", "/tmp/dedup",
}); });
@ -42,7 +43,7 @@ public class SparkCreateDedupTest {
SparkCreateConnectedComponent.main(new String[] { SparkCreateConnectedComponent.main(new String[] {
"-mt", "local[*]", "-mt", "local[*]",
"-s", "/home/sandro/betadump", "-s", "/home/sandro/betadump",
"-e", "publication", "-e", entity,
"-c", ArgumentApplicationParser.compressArgument(configuration), "-c", ArgumentApplicationParser.compressArgument(configuration),
"-t", "/tmp/dedup", "-t", "/tmp/dedup",
}); });
@ -54,7 +55,7 @@ public class SparkCreateDedupTest {
SparkCreateDedupRecord.main(new String[] { SparkCreateDedupRecord.main(new String[] {
"-mt", "local[*]", "-mt", "local[*]",
"-s", "/home/sandro/betadump", "-s", "/home/sandro/betadump",
"-e", "publication", "-e", entity,
"-c", ArgumentApplicationParser.compressArgument(configuration), "-c", ArgumentApplicationParser.compressArgument(configuration),
"-d", "/tmp/dedup", "-d", "/tmp/dedup",
}); });

View File

@ -7,7 +7,7 @@
"queueMaxSize": "2000", "queueMaxSize": "2000",
"groupMaxSize": "50", "groupMaxSize": "50",
"slidingWindowSize": "200", "slidingWindowSize": "200",
"idPath": ".id", "idPath": "$.id",
"rootBuilder": [ "rootBuilder": [
"organization", "organization",
"projectOrganization_participation_isParticipant", "projectOrganization_participation_isParticipant",
@ -84,7 +84,7 @@
"type": "String", "type": "String",
"weight": "0", "weight": "0",
"ignoreMissing": "false", "ignoreMissing": "false",
"path": ".country.classid" "path": "$.country.classid"
}, },
{ {
"name": "legalshortname", "name": "legalshortname",
@ -92,7 +92,7 @@
"type": "String", "type": "String",
"weight": "0.1", "weight": "0.1",
"ignoreMissing": "true", "ignoreMissing": "true",
"path": ".legalshortname.value" "path": "$.legalshortname.value"
}, },
{ {
"name": "legalname", "name": "legalname",
@ -100,7 +100,7 @@
"type": "String", "type": "String",
"weight": "0.9", "weight": "0.9",
"ignoreMissing": "false", "ignoreMissing": "false",
"path": ".legalname.value", "path": "$.legalname.value",
"params": { "params": {
"windowSize": 4, "windowSize": 4,
"threshold": 0.7 "threshold": 0.7
@ -112,11 +112,19 @@
"type": "URL", "type": "URL",
"weight": "0", "weight": "0",
"ignoreMissing": "true", "ignoreMissing": "true",
"path": ".websiteurl.value", "path": "$.websiteurl.value",
"params": { "params": {
"host": 0.5, "host": 0.5,
"path": 0.5 "path": 0.5
} }
},
{
"name": "gridid",
"algo": "Null",
"type": "String",
"weight": "0.0",
"ignoreMissing": "true",
"path": "$.pid[?(@.qualifier.classid ==\"grid\")].value"
} }
], ],
"blacklists": { "blacklists": {