indentation fixed

parent efe111de82
commit 46db6b02d3
@@ -18,12 +18,12 @@ class JsonPathTest {
@Test
void jsonToModelTest() throws IOException {
DedupConfig conf = DedupConfig
-.load(
-IOUtils
-.toString(
-SparkOpenorgsDedupTest.class
-.getResourceAsStream(
-"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
+.load(
+IOUtils
+.toString(
+SparkOpenorgsDedupTest.class
+.getResourceAsStream(
+"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));

final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));

@@ -58,7 +58,7 @@ class JsonPathTest {
void testJPath2() throws IOException {

DedupConfig conf = DedupConfig
-.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_dataset.json")));
+.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_dataset.json")));

final String dat = IOUtils.toString(getClass().getResourceAsStream("dataset_example1.json"));

@@ -1,12 +1,11 @@

package eu.dnetlib.dhp.oa.dedup.local;

-import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
-import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
-import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.OafEntity;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.pace.config.DedupConfig;
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.collections4.IteratorUtils;
import org.apache.commons.io.IOUtils;
@@ -16,178 +15,206 @@ import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.spark_project.guava.hash.Hashing;

+import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
+import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
+import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.pace.config.DedupConfig;
import scala.collection.JavaConverters;
import scala.collection.convert.Wrappers;
import scala.collection.mutable.ArrayBuffer;

-import java.io.*;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.stream.Collectors;

public abstract class DedupLocalTestUtils {

-public static String prepareTable(Row doc) {
-StringBuilder ret = new StringBuilder("<table>");
+public static String prepareTable(Row doc) {
+StringBuilder ret = new StringBuilder("<table>");

-for(String fieldName: doc.schema().fieldNames()) {
-Object value = doc.getAs(fieldName);
-if(value.getClass() == String.class){
-ret.append("<tr><th>").append(fieldName).append("</th><td>").append(value).append("</td></tr>");
-}
-else if(value.getClass() == Wrappers.JListWrapper.class) {
-List<String> values = IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator()).asJava())
-.stream()
-.map(DedupLocalTestUtils::takeValue)
-.collect(Collectors.toList());
-ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
-}
-else if(value.getClass() == ArrayBuffer.class){
-List<String> values = new ArrayList<>(IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
-ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
-}
+for (String fieldName : doc.schema().fieldNames()) {
+Object value = doc.getAs(fieldName);
+if (value.getClass() == String.class) {
+ret.append("<tr><th>").append(fieldName).append("</th><td>").append(value).append("</td></tr>");
+} else if (value.getClass() == Wrappers.JListWrapper.class) {
+List<String> values = IteratorUtils
+.toList(
+JavaConverters
+.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator())
+.asJava())
+.stream()
+.map(DedupLocalTestUtils::takeValue)
+.collect(Collectors.toList());
+ret
+.append("<tr><th>")
+.append(fieldName)
+.append("</th><td>[")
+.append(String.join(";", values))
+.append("]</td></tr>");
+} else if (value.getClass() == ArrayBuffer.class) {
+List<String> values = new ArrayList<>(IteratorUtils
+.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
+ret
+.append("<tr><th>")
+.append(fieldName)
+.append("</th><td>[")
+.append(String.join(";", values))
+.append("]</td></tr>");
+}
+
+}
}

-ret.append("</table>");
-return ret.toString();
+ret.append("</table>");
+return ret.toString();

-}
+}

-protected static String fileToString(String filePath) throws IOException {
+protected static String fileToString(String filePath) throws IOException {

-Path path=new Path(filePath);
-FileSystem fs = FileSystem.get(new Configuration());
-BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
-try {
-return String.join("", br.lines().collect(Collectors.toList()));
-} finally {
-br.close();
-}
-}
+Path path = new Path(filePath);
+FileSystem fs = FileSystem.get(new Configuration());
+BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
+try {
+return String.join("", br.lines().collect(Collectors.toList()));
+} finally {
+br.close();
+}
+}

-public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath, String templateFilePath) {
+public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath,
+String templateFilePath) {

-List<String> vertexes = entities.toJavaRDD().map(r -> r.getAs("identifier").toString()).collect();
+List<String> vertexes = entities.toJavaRDD().map(r -> r.getAs("identifier").toString()).collect();

-List<Node> nodes = entities.toJavaRDD().map(e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()), prepareTable(e))).collect();
+List<Node> nodes = entities
+.toJavaRDD()
+.map(
+e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()),
+prepareTable(e)))
+.collect();

-List<Edge> edges = simRels.toJavaRDD().collect().stream().map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget()))).collect(Collectors.toList());
+List<Edge> edges = simRels
+.toJavaRDD()
+.collect()
+.stream()
+.map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget())))
+.collect(Collectors.toList());

-try(FileWriter fw = new FileWriter(filePath)) {
-String fullText = IOUtils.toString(new FileReader(templateFilePath));
+try (FileWriter fw = new FileWriter(filePath)) {
+String fullText = IOUtils.toString(new FileReader(templateFilePath));

-String s = fullText
-.replaceAll("%nodes%", new ObjectMapper().writeValueAsString(nodes))
-.replaceAll("%edges%", new ObjectMapper().writeValueAsString(edges));
+String s = fullText
+.replaceAll("%nodes%", new ObjectMapper().writeValueAsString(nodes))
+.replaceAll("%edges%", new ObjectMapper().writeValueAsString(edges));

-IOUtils.write(s, fw);
-} catch (IOException e) {
-e.printStackTrace();
-}
+IOUtils.write(s, fw);
+} catch (IOException e) {
+e.printStackTrace();
+}

-}
+}

-public static long hash(final String id) {
-return Hashing.murmur3_128().hashString(id).asLong();
-}
+public static long hash(final String id) {
+return Hashing.murmur3_128().hashString(id).asLong();
+}

-public static Relation createRel(String source, String target, String relClass, DedupConfig dedupConf) {
+public static Relation createRel(String source, String target, String relClass, DedupConfig dedupConf) {

-String entityType = dedupConf.getWf().getEntityType();
+String entityType = dedupConf.getWf().getEntityType();

-Relation r = new Relation();
-r.setSource(source);
-r.setTarget(target);
-r.setRelClass(relClass);
-r.setRelType(entityType + entityType.substring(0, 1).toUpperCase() + entityType.substring(1));
-r.setSubRelType(ModelConstants.DEDUP);
-return r;
-}
+Relation r = new Relation();
+r.setSource(source);
+r.setTarget(target);
+r.setRelClass(relClass);
+r.setRelType(entityType + entityType.substring(0, 1).toUpperCase() + entityType.substring(1));
+r.setSubRelType(ModelConstants.DEDUP);
+return r;
+}

-public static OafEntity createOafEntity(String id, OafEntity base, long ts) {
-try {
-OafEntity res = (OafEntity) BeanUtils.cloneBean(base);
-res.setId(id);
-res.setLastupdatetimestamp(ts);
-return res;
-} catch (Exception e) {
-throw new RuntimeException(e);
-}
-}
+public static OafEntity createOafEntity(String id, OafEntity base, long ts) {
+try {
+OafEntity res = (OafEntity) BeanUtils.cloneBean(base);
+res.setId(id);
+res.setLastupdatetimestamp(ts);
+return res;
+} catch (Exception e) {
+throw new RuntimeException(e);
+}
+}

-public static String takeValue(String json) {
-ObjectMapper mapper = new ObjectMapper(new JsonFactory());
-try {
-JsonNode rootNode = mapper.readTree(json);
-return rootNode.get("value").toString().replaceAll("\"", "");
+public static String takeValue(String json) {
+ObjectMapper mapper = new ObjectMapper(new JsonFactory());
+try {
+JsonNode rootNode = mapper.readTree(json);
+return rootNode.get("value").toString().replaceAll("\"", "");

-} catch (Exception e) {
-return json;
-}
+} catch (Exception e) {
+return json;
+}

-}
+}

}

-class Node implements Serializable{
-String label;
-int id;
-String title;
+class Node implements Serializable {
+String label;
+int id;
+String title;

-public Node(String label, int id, String title) {
-this.label = label;
-this.id = id;
-this.title = title;
-}
+public Node(String label, int id, String title) {
+this.label = label;
+this.id = id;
+this.title = title;
+}

-public String getLabel() {
-return label;
-}
+public String getLabel() {
+return label;
+}

-public void setLabel(String label) {
-this.label = label;
-}
+public void setLabel(String label) {
+this.label = label;
+}

-public int getId() {
-return id;
-}
+public int getId() {
+return id;
+}

-public void setId(int id) {
-this.id = id;
-}
+public void setId(int id) {
+this.id = id;
+}

-public String getTitle() {
-return title;
-}
+public String getTitle() {
+return title;
+}

-public void setTitle(String title) {
-this.title = title;
-}
+public void setTitle(String title) {
+this.title = title;
+}
}

-class Edge implements Serializable{
-int from;
-int to;
+class Edge implements Serializable {
+int from;
+int to;

-public Edge(int from, int to) {
-this.from = from;
-this.to = to;
-}
+public Edge(int from, int to) {
+this.from = from;
+this.to = to;
+}

-public int getFrom() {
-return from;
-}
+public int getFrom() {
+return from;
+}

-public void setFrom(int from) {
-this.from = from;
-}
+public void setFrom(int from) {
+this.from = from;
+}

-public int getTo() {
-return to;
-}
+public int getTo() {
+return to;
+}

-public void setTo(int to) {
-this.to = to;
-}
+public void setTo(int to) {
+this.to = to;
+}
}
File diff suppressed because one or more lines are too long

@@ -72,9 +72,9 @@ public class GraphHiveTableImporterJob {
final Encoder<T> clazzEncoder = Encoders.bean(clazz);

Dataset<Row> dataset = spark
-.read()
-.schema(clazzEncoder.schema())
-.json(inputPath);
+.read()
+.schema(clazzEncoder.schema())
+.json(inputPath);

if (numPartitions > 0) {
log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
@@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest {
assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
+// assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
}

@Test def testDocumentationNames(): Unit = {
assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
}