indentation fixed
This commit is contained in:
parent
efe111de82
commit
46db6b02d3
|
@ -18,12 +18,12 @@ class JsonPathTest {
|
||||||
@Test
|
@Test
|
||||||
void jsonToModelTest() throws IOException {
|
void jsonToModelTest() throws IOException {
|
||||||
DedupConfig conf = DedupConfig
|
DedupConfig conf = DedupConfig
|
||||||
.load(
|
.load(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
SparkOpenorgsDedupTest.class
|
SparkOpenorgsDedupTest.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||||
|
|
||||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ class JsonPathTest {
|
||||||
void testJPath2() throws IOException {
|
void testJPath2() throws IOException {
|
||||||
|
|
||||||
DedupConfig conf = DedupConfig
|
DedupConfig conf = DedupConfig
|
||||||
.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_dataset.json")));
|
.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_dataset.json")));
|
||||||
|
|
||||||
final String dat = IOUtils.toString(getClass().getResourceAsStream("dataset_example1.json"));
|
final String dat = IOUtils.toString(getClass().getResourceAsStream("dataset_example1.json"));
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,11 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup.local;
|
package eu.dnetlib.dhp.oa.dedup.local;
|
||||||
|
|
||||||
import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
|
import java.io.*;
|
||||||
import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
|
import java.util.ArrayList;
|
||||||
import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;
|
import java.util.List;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import java.util.stream.Collectors;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
import org.apache.commons.beanutils.BeanUtils;
|
import org.apache.commons.beanutils.BeanUtils;
|
||||||
import org.apache.commons.collections4.IteratorUtils;
|
import org.apache.commons.collections4.IteratorUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -16,178 +15,206 @@ import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Row;
|
import org.apache.spark.sql.Row;
|
||||||
import org.spark_project.guava.hash.Hashing;
|
import org.spark_project.guava.hash.Hashing;
|
||||||
|
|
||||||
|
import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
|
||||||
|
import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
|
||||||
|
import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import scala.collection.JavaConverters;
|
import scala.collection.JavaConverters;
|
||||||
import scala.collection.convert.Wrappers;
|
import scala.collection.convert.Wrappers;
|
||||||
import scala.collection.mutable.ArrayBuffer;
|
import scala.collection.mutable.ArrayBuffer;
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
public abstract class DedupLocalTestUtils {
|
public abstract class DedupLocalTestUtils {
|
||||||
|
|
||||||
public static String prepareTable(Row doc) {
|
public static String prepareTable(Row doc) {
|
||||||
StringBuilder ret = new StringBuilder("<table>");
|
StringBuilder ret = new StringBuilder("<table>");
|
||||||
|
|
||||||
for(String fieldName: doc.schema().fieldNames()) {
|
for (String fieldName : doc.schema().fieldNames()) {
|
||||||
Object value = doc.getAs(fieldName);
|
Object value = doc.getAs(fieldName);
|
||||||
if(value.getClass() == String.class){
|
if (value.getClass() == String.class) {
|
||||||
ret.append("<tr><th>").append(fieldName).append("</th><td>").append(value).append("</td></tr>");
|
ret.append("<tr><th>").append(fieldName).append("</th><td>").append(value).append("</td></tr>");
|
||||||
}
|
} else if (value.getClass() == Wrappers.JListWrapper.class) {
|
||||||
else if(value.getClass() == Wrappers.JListWrapper.class) {
|
List<String> values = IteratorUtils
|
||||||
List<String> values = IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator()).asJava())
|
.toList(
|
||||||
.stream()
|
JavaConverters
|
||||||
.map(DedupLocalTestUtils::takeValue)
|
.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator())
|
||||||
.collect(Collectors.toList());
|
.asJava())
|
||||||
ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
|
.stream()
|
||||||
}
|
.map(DedupLocalTestUtils::takeValue)
|
||||||
else if(value.getClass() == ArrayBuffer.class){
|
.collect(Collectors.toList());
|
||||||
List<String> values = new ArrayList<>(IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
|
ret
|
||||||
ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
|
.append("<tr><th>")
|
||||||
}
|
.append(fieldName)
|
||||||
|
.append("</th><td>[")
|
||||||
|
.append(String.join(";", values))
|
||||||
|
.append("]</td></tr>");
|
||||||
|
} else if (value.getClass() == ArrayBuffer.class) {
|
||||||
|
List<String> values = new ArrayList<>(IteratorUtils
|
||||||
|
.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
|
||||||
|
ret
|
||||||
|
.append("<tr><th>")
|
||||||
|
.append(fieldName)
|
||||||
|
.append("</th><td>[")
|
||||||
|
.append(String.join(";", values))
|
||||||
|
.append("]</td></tr>");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ret.append("</table>");
|
ret.append("</table>");
|
||||||
return ret.toString();
|
return ret.toString();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static String fileToString(String filePath) throws IOException {
|
protected static String fileToString(String filePath) throws IOException {
|
||||||
|
|
||||||
Path path=new Path(filePath);
|
Path path = new Path(filePath);
|
||||||
FileSystem fs = FileSystem.get(new Configuration());
|
FileSystem fs = FileSystem.get(new Configuration());
|
||||||
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
|
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
|
||||||
try {
|
try {
|
||||||
return String.join("", br.lines().collect(Collectors.toList()));
|
return String.join("", br.lines().collect(Collectors.toList()));
|
||||||
} finally {
|
} finally {
|
||||||
br.close();
|
br.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath, String templateFilePath) {
|
public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath,
|
||||||
|
String templateFilePath) {
|
||||||
|
|
||||||
List<String> vertexes = entities.toJavaRDD().map(r -> r.getAs("identifier").toString()).collect();
|
List<String> vertexes = entities.toJavaRDD().map(r -> r.getAs("identifier").toString()).collect();
|
||||||
|
|
||||||
List<Node> nodes = entities.toJavaRDD().map(e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()), prepareTable(e))).collect();
|
List<Node> nodes = entities
|
||||||
|
.toJavaRDD()
|
||||||
|
.map(
|
||||||
|
e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()),
|
||||||
|
prepareTable(e)))
|
||||||
|
.collect();
|
||||||
|
|
||||||
List<Edge> edges = simRels.toJavaRDD().collect().stream().map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget()))).collect(Collectors.toList());
|
List<Edge> edges = simRels
|
||||||
|
.toJavaRDD()
|
||||||
|
.collect()
|
||||||
|
.stream()
|
||||||
|
.map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget())))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
try(FileWriter fw = new FileWriter(filePath)) {
|
try (FileWriter fw = new FileWriter(filePath)) {
|
||||||
String fullText = IOUtils.toString(new FileReader(templateFilePath));
|
String fullText = IOUtils.toString(new FileReader(templateFilePath));
|
||||||
|
|
||||||
String s = fullText
|
String s = fullText
|
||||||
.replaceAll("%nodes%", new ObjectMapper().writeValueAsString(nodes))
|
.replaceAll("%nodes%", new ObjectMapper().writeValueAsString(nodes))
|
||||||
.replaceAll("%edges%", new ObjectMapper().writeValueAsString(edges));
|
.replaceAll("%edges%", new ObjectMapper().writeValueAsString(edges));
|
||||||
|
|
||||||
IOUtils.write(s, fw);
|
IOUtils.write(s, fw);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static long hash(final String id) {
|
public static long hash(final String id) {
|
||||||
return Hashing.murmur3_128().hashString(id).asLong();
|
return Hashing.murmur3_128().hashString(id).asLong();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Relation createRel(String source, String target, String relClass, DedupConfig dedupConf) {
|
public static Relation createRel(String source, String target, String relClass, DedupConfig dedupConf) {
|
||||||
|
|
||||||
String entityType = dedupConf.getWf().getEntityType();
|
String entityType = dedupConf.getWf().getEntityType();
|
||||||
|
|
||||||
Relation r = new Relation();
|
Relation r = new Relation();
|
||||||
r.setSource(source);
|
r.setSource(source);
|
||||||
r.setTarget(target);
|
r.setTarget(target);
|
||||||
r.setRelClass(relClass);
|
r.setRelClass(relClass);
|
||||||
r.setRelType(entityType + entityType.substring(0, 1).toUpperCase() + entityType.substring(1));
|
r.setRelType(entityType + entityType.substring(0, 1).toUpperCase() + entityType.substring(1));
|
||||||
r.setSubRelType(ModelConstants.DEDUP);
|
r.setSubRelType(ModelConstants.DEDUP);
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static OafEntity createOafEntity(String id, OafEntity base, long ts) {
|
public static OafEntity createOafEntity(String id, OafEntity base, long ts) {
|
||||||
try {
|
try {
|
||||||
OafEntity res = (OafEntity) BeanUtils.cloneBean(base);
|
OafEntity res = (OafEntity) BeanUtils.cloneBean(base);
|
||||||
res.setId(id);
|
res.setId(id);
|
||||||
res.setLastupdatetimestamp(ts);
|
res.setLastupdatetimestamp(ts);
|
||||||
return res;
|
return res;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String takeValue(String json) {
|
public static String takeValue(String json) {
|
||||||
ObjectMapper mapper = new ObjectMapper(new JsonFactory());
|
ObjectMapper mapper = new ObjectMapper(new JsonFactory());
|
||||||
try {
|
try {
|
||||||
JsonNode rootNode = mapper.readTree(json);
|
JsonNode rootNode = mapper.readTree(json);
|
||||||
return rootNode.get("value").toString().replaceAll("\"", "");
|
return rootNode.get("value").toString().replaceAll("\"", "");
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
return json;
|
return json;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class Node implements Serializable{
|
class Node implements Serializable {
|
||||||
String label;
|
String label;
|
||||||
int id;
|
int id;
|
||||||
String title;
|
String title;
|
||||||
|
|
||||||
public Node(String label, int id, String title) {
|
public Node(String label, int id, String title) {
|
||||||
this.label = label;
|
this.label = label;
|
||||||
this.id = id;
|
this.id = id;
|
||||||
this.title = title;
|
this.title = title;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getLabel() {
|
public String getLabel() {
|
||||||
return label;
|
return label;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setLabel(String label) {
|
public void setLabel(String label) {
|
||||||
this.label = label;
|
this.label = label;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getId() {
|
public int getId() {
|
||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setId(int id) {
|
public void setId(int id) {
|
||||||
this.id = id;
|
this.id = id;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTitle() {
|
public String getTitle() {
|
||||||
return title;
|
return title;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setTitle(String title) {
|
public void setTitle(String title) {
|
||||||
this.title = title;
|
this.title = title;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class Edge implements Serializable{
|
class Edge implements Serializable {
|
||||||
int from;
|
int from;
|
||||||
int to;
|
int to;
|
||||||
|
|
||||||
public Edge(int from, int to) {
|
public Edge(int from, int to) {
|
||||||
this.from = from;
|
this.from = from;
|
||||||
this.to = to;
|
this.to = to;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getFrom() {
|
public int getFrom() {
|
||||||
return from;
|
return from;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setFrom(int from) {
|
public void setFrom(int from) {
|
||||||
this.from = from;
|
this.from = from;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getTo() {
|
public int getTo() {
|
||||||
return to;
|
return to;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setTo(int to) {
|
public void setTo(int to) {
|
||||||
this.to = to;
|
this.to = to;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -72,9 +72,9 @@ public class GraphHiveTableImporterJob {
|
||||||
final Encoder<T> clazzEncoder = Encoders.bean(clazz);
|
final Encoder<T> clazzEncoder = Encoders.bean(clazz);
|
||||||
|
|
||||||
Dataset<Row> dataset = spark
|
Dataset<Row> dataset = spark
|
||||||
.read()
|
.read()
|
||||||
.schema(clazzEncoder.schema())
|
.schema(clazzEncoder.schema())
|
||||||
.json(inputPath);
|
.json(inputPath);
|
||||||
|
|
||||||
if (numPartitions > 0) {
|
if (numPartitions > 0) {
|
||||||
log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
|
log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
|
||||||
|
|
|
@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest {
|
||||||
assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
|
assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
|
||||||
// assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
|
// assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test def testDocumentationNames(): Unit = {
|
@Test def testDocumentationNames(): Unit = {
|
||||||
assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
|
assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue