indentation fixed
parent efe111de82
commit 46db6b02d3
@@ -1,12 +1,11 @@

package eu.dnetlib.dhp.oa.dedup.local;

import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.collections4.IteratorUtils;
import org.apache.commons.io.IOUtils;
@@ -16,15 +15,19 @@ import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.spark_project.guava.hash.Hashing;

import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import scala.collection.JavaConverters;
import scala.collection.convert.Wrappers;
import scala.collection.mutable.ArrayBuffer;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

public abstract class DedupLocalTestUtils {

public static String prepareTable(Row doc) {
@@ -34,17 +37,30 @@ public abstract class DedupLocalTestUtils {
Object value = doc.getAs(fieldName);
if (value.getClass() == String.class) {
ret.append("<tr><th>").append(fieldName).append("</th><td>").append(value).append("</td></tr>");
}
else if(value.getClass() == Wrappers.JListWrapper.class) {
List<String> values = IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator()).asJava())
} else if (value.getClass() == Wrappers.JListWrapper.class) {
List<String> values = IteratorUtils
.toList(
JavaConverters
.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator())
.asJava())
.stream()
.map(DedupLocalTestUtils::takeValue)
.collect(Collectors.toList());
ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
}
else if(value.getClass() == ArrayBuffer.class){
List<String> values = new ArrayList<>(IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
ret
.append("<tr><th>")
.append(fieldName)
.append("</th><td>[")
.append(String.join(";", values))
.append("]</td></tr>");
} else if (value.getClass() == ArrayBuffer.class) {
List<String> values = new ArrayList<>(IteratorUtils
.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
ret
.append("<tr><th>")
.append(fieldName)
.append("</th><td>[")
.append(String.join(";", values))
.append("]</td></tr>");
}

}
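The branches in the hunk above unwrap the Scala collection types (Wrappers.JListWrapper, ArrayBuffer) that Spark hands back from Row.getAs. A minimal standalone sketch of the same conversion idiom, with invented sample data (only the ArrayBuffer case is shown; commons-collections4 and the Scala library are assumed to be on the classpath):

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.collections4.IteratorUtils;

import scala.collection.JavaConverters;
import scala.collection.mutable.ArrayBuffer;

public class ScalaCollectionSketch {
    public static void main(String[] args) {
        // stand-in for a value pulled out of a Spark Row with row.getAs(fieldName)
        ArrayBuffer<String> value = new ArrayBuffer<>();
        value.$plus$eq("first");  // Scala's += operator, as seen from Java
        value.$plus$eq("second");

        // same Scala-iterator-to-Java-list idiom as the ArrayBuffer branch above
        List<String> values = new ArrayList<>(
            IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(value.iterator()).asJava()));

        System.out.println("[" + String.join(";", values) + "]"); // prints [first;second]
    }
}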
@@ -66,13 +82,24 @@ public abstract class DedupLocalTestUtils {
}
}

public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath, String templateFilePath) {
public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath,
String templateFilePath) {

List<String> vertexes = entities.toJavaRDD().map(r -> r.getAs("identifier").toString()).collect();

List<Node> nodes = entities.toJavaRDD().map(e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()), prepareTable(e))).collect();
List<Node> nodes = entities
.toJavaRDD()
.map(
e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()),
prepareTable(e)))
.collect();

List<Edge> edges = simRels.toJavaRDD().collect().stream().map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget()))).collect(Collectors.toList());
List<Edge> edges = simRels
.toJavaRDD()
.collect()
.stream()
.map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget())))
.collect(Collectors.toList());

try (FileWriter fw = new FileWriter(filePath)) {
String fullText = IOUtils.toString(new FileReader(templateFilePath));

@@ -1,19 +1,15 @@

package eu.dnetlib.dhp.oa.dedup.local;

import com.google.common.collect.Lists;
import com.kwartile.lib.cc.ConnectedComponent;
import eu.dnetlib.dhp.oa.dedup.DedupUtility;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.SparkDeduper;
import eu.dnetlib.pace.model.SparkModel;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.*;
import java.util.List;
import java.util.stream.Stream;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
@@ -26,20 +22,27 @@ import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.platform.commons.util.StringUtils;
import org.mockito.junit.jupiter.MockitoExtension;

import com.google.common.collect.Lists;
import com.kwartile.lib.cc.ConnectedComponent;

import eu.dnetlib.dhp.oa.dedup.DedupUtility;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.SparkDeduper;
import eu.dnetlib.pace.model.SparkModel;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import scala.Tuple2;
import scala.Tuple3;
import scala.collection.JavaConversions;
import scala.collection.mutable.WrappedArray;

import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.List;
import java.util.*;
import java.util.stream.Stream;

@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@@ -50,12 +53,18 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
static JavaSparkContext context;

final String entitiesPath = Paths
.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI())
.get(
Objects
.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication"))
.toURI())
.toFile()
.getAbsolutePath();

final String dedupConfPath = Paths
.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI())
.get(
Objects
.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))
.toURI())
.toFile()
.getAbsolutePath();

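Both fields above use the same classpath-resource idiom (Class.getResource → URI → absolute file path), which this commit merely reflows. A minimal sketch of that pattern, with a hypothetical resource name:

import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.Objects;

public class ResourcePathSketch {
    public static void main(String[] args) throws URISyntaxException {
        // resource name is invented; requireNonNull fails fast if it is missing from the classpath
        String path = Paths
            .get(
                Objects
                    .requireNonNull(ResourcePathSketch.class.getResource("/some/test/fixture.json"))
                    .toURI())
            .toFile()
            .getAbsolutePath();
        System.out.println(path);
    }
}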
@@ -99,9 +108,11 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
Dataset<Relation> simRels = entities
.transform(deduper.dedup())
.distinct()
.map((MapFunction<Row, Relation>) t ->
DedupUtility.createSimRel(t.getStruct(0).getString(0), t.getStruct(0).getString(1), config.getWf().getEntityType()), Encoders.bean(Relation.class)
);
.map(
(MapFunction<Row, Relation>) t -> DedupUtility
.createSimRel(
t.getStruct(0).getString(0), t.getStruct(0).getString(1), config.getWf().getEntityType()),
Encoders.bean(Relation.class));

long simrels_time = System.currentTimeMillis() - before_simrels;

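The reflowed map(...) call above keeps the explicit MapFunction cast: in Java the lambda is otherwise ambiguous between Spark's Scala Function1 and Java MapFunction overloads of Dataset.map, and the cast selects the Encoder-taking Java overload. A small self-contained illustration of the same pattern (local Spark session and toy data, nothing from this test):

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class MapFunctionCastSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[1]").appName("sketch").getOrCreate();
        Dataset<Row> rows = spark.sql("SELECT 'p1' AS id UNION ALL SELECT 'p2' AS id");

        // the (MapFunction<Row, String>) cast picks the map(MapFunction, Encoder) overload
        Dataset<String> ids = rows.map((MapFunction<Row, String>) r -> r.getString(0), Encoders.STRING());

        ids.show();
        spark.stop();
    }
}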
@@ -131,7 +142,6 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
System.out.println(r.getSource() + " ---> " + r.getTarget());
}


// resolve connected components
// ("vertexId", "groupId")
Dataset<Row> cliques = ConnectedComponent
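The comment above notes that the ConnectedComponent step resolves similarity relations into ("vertexId", "groupId") pairs. As a rough, non-Spark illustration of what that grouping computes (plain union-find over invented ids, not the com.kwartile.lib.cc implementation used here):

import java.util.HashMap;
import java.util.Map;

public class ConnectedComponentSketch {
    static Map<String, String> parent = new HashMap<>();

    // find with path compression
    static String find(String v) {
        parent.putIfAbsent(v, v);
        String p = parent.get(v);
        if (!p.equals(v)) {
            p = find(p);
            parent.put(v, p);
        }
        return p;
    }

    static void union(String a, String b) {
        parent.put(find(a), find(b));
    }

    public static void main(String[] args) {
        // each union mirrors one similarity relation (source ---> target)
        union("p1", "p2");
        union("p2", "p3");
        union("p4", "p5");
        for (String v : parent.keySet())
            System.out.println(v + " -> " + find(v)); // ("vertexId", "groupId")
    }
}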
@@ -155,18 +165,23 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
return res.iterator();
}, Encoders.bean(Relation.class));


long mergerels_time = System.currentTimeMillis() - before_mergerels;

long mergerels_number = mergeRels.count();

long before_dedupentity = System.currentTimeMillis();

final Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(config.getWf().getSubEntityValue()));
final Class<OafEntity> clazz = ModelSupport.entityTypes
.get(EntityType.valueOf(config.getWf().getSubEntityValue()));
final Encoder<OafEntity> beanEncoder = Encoders.bean(clazz);
final Encoder<OafEntity> kryoEncoder = Encoders.kryo(clazz);

Dataset<Row> kryoEntities = spark.read().schema(Encoders.bean(clazz).schema()).json(entitiesPath).as(beanEncoder).map(
Dataset<Row> kryoEntities = spark
.read()
.schema(Encoders.bean(clazz).schema())
.json(entitiesPath)
.as(beanEncoder)
.map(
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) entity -> {
return new Tuple2<>(entity.getId(), entity);
},
@@ -279,7 +294,8 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {

List<Row> rows = Lists.newArrayList(a, b);

Dataset<Row> rowsDS = spark.createDataset(rows, RowEncoder.apply(model.schema()))
Dataset<Row> rowsDS = spark
.createDataset(rows, RowEncoder.apply(model.schema()))
.transform(deduper.filterAndCleanup())
.transform(deduper.generateClustersWithCollect());

@@ -296,7 +312,15 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
entities,
simRels,
"/tmp/graph.html",
Paths.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/visualization_tools/graph_template.html")).toURI()).toFile().getAbsolutePath());
Paths
.get(
Objects
.requireNonNull(
SparkDedupLocalTest.class
.getResource("/eu/dnetlib/dhp/dedup/visualization_tools/graph_template.html"))
.toURI())
.toFile()
.getAbsolutePath());
Desktop.getDesktop().browse(new File("/tmp/graph.html").toURI());
} catch (Exception e) {
e.printStackTrace();

@@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest {
assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
// assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
}

@Test def testDocumentationNames(): Unit = {
assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
}