indentation fixed

Michele De Bonis 2024-10-10 10:48:51 +02:00
parent efe111de82
commit 46db6b02d3
5 changed files with 479 additions and 427 deletions


@@ -18,12 +18,12 @@ class JsonPathTest {
 	@Test
 	void jsonToModelTest() throws IOException {
 		DedupConfig conf = DedupConfig
 			.load(
 				IOUtils
 					.toString(
 						SparkOpenorgsDedupTest.class
 							.getResourceAsStream(
 								"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));

 		final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
@@ -58,7 +58,7 @@ class JsonPathTest {
 	void testJPath2() throws IOException {
 		DedupConfig conf = DedupConfig
 			.load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_dataset.json")));

 		final String dat = IOUtils.toString(getClass().getResourceAsStream("dataset_example1.json"));


@@ -1,12 +1,11 @@
 package eu.dnetlib.dhp.oa.dedup.local;

-import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
-import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
-import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.OafEntity;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.pace.config.DedupConfig;
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
 import org.apache.commons.beanutils.BeanUtils;
 import org.apache.commons.collections4.IteratorUtils;
 import org.apache.commons.io.IOUtils;

@@ -16,178 +15,206 @@ import org.apache.hadoop.fs.Path;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.spark_project.guava.hash.Hashing;
+
+import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
+import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
+import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.pace.config.DedupConfig;
+
 import scala.collection.JavaConverters;
 import scala.collection.convert.Wrappers;
 import scala.collection.mutable.ArrayBuffer;
-
-import java.io.*;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.stream.Collectors;
 public abstract class DedupLocalTestUtils {

 	public static String prepareTable(Row doc) {
 		StringBuilder ret = new StringBuilder("<table>");
-		for(String fieldName: doc.schema().fieldNames()) {
+		for (String fieldName : doc.schema().fieldNames()) {
 			Object value = doc.getAs(fieldName);
-			if(value.getClass() == String.class){
+			if (value.getClass() == String.class) {
 				ret.append("<tr><th>").append(fieldName).append("</th><td>").append(value).append("</td></tr>");
-			}
-			else if(value.getClass() == Wrappers.JListWrapper.class) {
-				List<String> values = IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator()).asJava())
-						.stream()
-						.map(DedupLocalTestUtils::takeValue)
-						.collect(Collectors.toList());
-				ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
-			}
-			else if(value.getClass() == ArrayBuffer.class){
-				List<String> values = new ArrayList<>(IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
-				ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
-			}
+			} else if (value.getClass() == Wrappers.JListWrapper.class) {
+				List<String> values = IteratorUtils
+					.toList(
+						JavaConverters
+							.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator())
+							.asJava())
+					.stream()
+					.map(DedupLocalTestUtils::takeValue)
+					.collect(Collectors.toList());
+				ret
+					.append("<tr><th>")
+					.append(fieldName)
+					.append("</th><td>[")
+					.append(String.join(";", values))
+					.append("]</td></tr>");
+			} else if (value.getClass() == ArrayBuffer.class) {
+				List<String> values = new ArrayList<>(IteratorUtils
+					.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
+				ret
+					.append("<tr><th>")
+					.append(fieldName)
+					.append("</th><td>[")
+					.append(String.join(";", values))
+					.append("]</td></tr>");
+			}
 		}
 		ret.append("</table>");
 		return ret.toString();
 	}

 	protected static String fileToString(String filePath) throws IOException {
-		Path path=new Path(filePath);
+		Path path = new Path(filePath);
 		FileSystem fs = FileSystem.get(new Configuration());
-		BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
+		BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
 		try {
 			return String.join("", br.lines().collect(Collectors.toList()));
 		} finally {
 			br.close();
 		}
 	}

-	public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath, String templateFilePath) {
+	public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath,
+		String templateFilePath) {

 		List<String> vertexes = entities.toJavaRDD().map(r -> r.getAs("identifier").toString()).collect();

-		List<Node> nodes = entities.toJavaRDD().map(e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()), prepareTable(e))).collect();
+		List<Node> nodes = entities
+			.toJavaRDD()
+			.map(
+				e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()),
+					prepareTable(e)))
+			.collect();

-		List<Edge> edges = simRels.toJavaRDD().collect().stream().map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget()))).collect(Collectors.toList());
+		List<Edge> edges = simRels
+			.toJavaRDD()
+			.collect()
+			.stream()
+			.map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget())))
+			.collect(Collectors.toList());

-		try(FileWriter fw = new FileWriter(filePath)) {
+		try (FileWriter fw = new FileWriter(filePath)) {
 			String fullText = IOUtils.toString(new FileReader(templateFilePath));
 			String s = fullText
 				.replaceAll("%nodes%", new ObjectMapper().writeValueAsString(nodes))
 				.replaceAll("%edges%", new ObjectMapper().writeValueAsString(edges));
 			IOUtils.write(s, fw);
 		} catch (IOException e) {
 			e.printStackTrace();
 		}
 	}

 	public static long hash(final String id) {
 		return Hashing.murmur3_128().hashString(id).asLong();
 	}

 	public static Relation createRel(String source, String target, String relClass, DedupConfig dedupConf) {

 		String entityType = dedupConf.getWf().getEntityType();

 		Relation r = new Relation();
 		r.setSource(source);
 		r.setTarget(target);
 		r.setRelClass(relClass);
 		r.setRelType(entityType + entityType.substring(0, 1).toUpperCase() + entityType.substring(1));
 		r.setSubRelType(ModelConstants.DEDUP);
 		return r;
 	}

 	public static OafEntity createOafEntity(String id, OafEntity base, long ts) {
 		try {
 			OafEntity res = (OafEntity) BeanUtils.cloneBean(base);
 			res.setId(id);
 			res.setLastupdatetimestamp(ts);
 			return res;
 		} catch (Exception e) {
 			throw new RuntimeException(e);
 		}
 	}

 	public static String takeValue(String json) {
 		ObjectMapper mapper = new ObjectMapper(new JsonFactory());
 		try {
 			JsonNode rootNode = mapper.readTree(json);
 			return rootNode.get("value").toString().replaceAll("\"", "");
 		} catch (Exception e) {
 			return json;
 		}
 	}
 }

-class Node implements Serializable{
+class Node implements Serializable {
 	String label;
 	int id;
 	String title;

 	public Node(String label, int id, String title) {
 		this.label = label;
 		this.id = id;
 		this.title = title;
 	}

 	public String getLabel() {
 		return label;
 	}

 	public void setLabel(String label) {
 		this.label = label;
 	}

 	public int getId() {
 		return id;
 	}

 	public void setId(int id) {
 		this.id = id;
 	}

 	public String getTitle() {
 		return title;
 	}

 	public void setTitle(String title) {
 		this.title = title;
 	}
 }

-class Edge implements Serializable{
+class Edge implements Serializable {
 	int from;
 	int to;

 	public Edge(int from, int to) {
 		this.from = from;
 		this.to = to;
 	}

 	public int getFrom() {
 		return from;
 	}

 	public void setFrom(int from) {
 		this.from = from;
 	}

 	public int getTo() {
 		return to;
 	}

 	public void setTo(int to) {
 		this.to = to;
 	}
 }
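
Not part of the commit, but for orientation: a minimal sketch of how these helpers can be driven end to end. The SparkSession setup, the input paths, and the template location are illustrative assumptions; the only hard requirement visible in the code above is that entities expose an "identifier" column and that the template contains %nodes% and %edges% placeholders.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.oa.dedup.local.DedupLocalTestUtils;
import eu.dnetlib.dhp.schema.oaf.Relation;

public class GraphRenderSketch {
	public static void main(String[] args) {
		// Hypothetical local session and paths, for illustration only.
		SparkSession spark = SparkSession.builder().master("local[*]").appName("dedup-graph").getOrCreate();

		// Entities must expose an "identifier" column; simrels are the dedup similarity relations.
		Dataset<Row> entities = spark.read().json("/tmp/entities.json");
		Dataset<Relation> simRels = spark.read().json("/tmp/simrels.json").as(Encoders.bean(Relation.class));

		// Replaces the %nodes% and %edges% placeholders of the HTML template
		// with the JSON arrays of Node and Edge objects built above.
		DedupLocalTestUtils.prepareGraphParams(entities, simRels, "/tmp/graph.html", "/tmp/graph_template.html");

		spark.stop();
	}
}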


@@ -72,9 +72,9 @@ public class GraphHiveTableImporterJob {
 		final Encoder<T> clazzEncoder = Encoders.bean(clazz);

 		Dataset<Row> dataset = spark
 			.read()
 			.schema(clazzEncoder.schema())
 			.json(inputPath);

 		if (numPartitions > 0) {
 			log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
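
The hunk above pins the JSON reader's schema to the bean encoder's schema instead of relying on Spark's inference. A self-contained sketch of the same pattern, using a hypothetical Project bean and an illustrative input path:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class BeanSchemaReadSketch {

	// Hypothetical bean for this sketch; any JavaBean with getters/setters works with Encoders.bean.
	public static class Project {
		private String id;
		private String title;

		public String getId() { return id; }
		public void setId(String id) { this.id = id; }
		public String getTitle() { return title; }
		public void setTitle(String title) { this.title = title; }
	}

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[*]").appName("bean-schema-read").getOrCreate();

		Encoder<Project> encoder = Encoders.bean(Project.class);

		// Supplying the schema up front skips Spark's inference pass over the input
		// and drops any JSON fields the bean does not declare.
		Dataset<Row> dataset = spark.read().schema(encoder.schema()).json("/tmp/projects.json"); // path is illustrative

		dataset.printSchema();
		spark.stop();
	}
}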


@@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest {
 		assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
 		// assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
 	}
+
 	@Test def testDocumentationNames(): Unit = {
 		assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
 	}