indentation fixed

Michele De Bonis 2024-10-10 10:48:51 +02:00
parent efe111de82
commit 46db6b02d3
5 changed files with 479 additions and 427 deletions

View File: DedupLocalTestUtils.java

@@ -1,12 +1,11 @@
 package eu.dnetlib.dhp.oa.dedup.local;
-import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
-import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
-import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.OafEntity;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.pace.config.DedupConfig;
-import java.io.*;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.stream.Collectors;
 import org.apache.commons.beanutils.BeanUtils;
 import org.apache.commons.collections4.IteratorUtils;
 import org.apache.commons.io.IOUtils;
@@ -16,15 +15,19 @@ import org.apache.hadoop.fs.Path;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.spark_project.guava.hash.Hashing;
+import com.cloudera.com.fasterxml.jackson.core.JsonFactory;
+import com.cloudera.com.fasterxml.jackson.databind.JsonNode;
+import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.pace.config.DedupConfig;
 import scala.collection.JavaConverters;
 import scala.collection.convert.Wrappers;
 import scala.collection.mutable.ArrayBuffer;
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
 public abstract class DedupLocalTestUtils {
 public static String prepareTable(Row doc) {
@@ -34,17 +37,30 @@ public abstract class DedupLocalTestUtils {
 Object value = doc.getAs(fieldName);
 if (value.getClass() == String.class) {
 ret.append("<tr><th>").append(fieldName).append("</th><td>").append(value).append("</td></tr>");
-}
-else if(value.getClass() == Wrappers.JListWrapper.class) {
-List<String> values = IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator()).asJava())
+} else if (value.getClass() == Wrappers.JListWrapper.class) {
+List<String> values = IteratorUtils
+.toList(
+JavaConverters
+.asJavaIteratorConverter(((Wrappers.JListWrapper<String>) value).iterator())
+.asJava())
 .stream()
 .map(DedupLocalTestUtils::takeValue)
 .collect(Collectors.toList());
-ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
-}
-else if(value.getClass() == ArrayBuffer.class){
-List<String> values = new ArrayList<>(IteratorUtils.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
-ret.append("<tr><th>").append(fieldName).append("</th><td>[").append(String.join(";", values)).append("]</td></tr>");
+ret
+.append("<tr><th>")
+.append(fieldName)
+.append("</th><td>[")
+.append(String.join(";", values))
+.append("]</td></tr>");
+} else if (value.getClass() == ArrayBuffer.class) {
+List<String> values = new ArrayList<>(IteratorUtils
+.toList(JavaConverters.asJavaIteratorConverter(((ArrayBuffer<String>) value).iterator()).asJava()));
+ret
+.append("<tr><th>")
+.append(fieldName)
+.append("</th><td>[")
+.append(String.join(";", values))
+.append("]</td></tr>");
 }
 }
@@ -66,13 +82,24 @@
 }
 }
-public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath, String templateFilePath) {
+public static void prepareGraphParams(Dataset<Row> entities, Dataset<Relation> simRels, String filePath,
+String templateFilePath) {
 List<String> vertexes = entities.toJavaRDD().map(r -> r.getAs("identifier").toString()).collect();
-List<Node> nodes = entities.toJavaRDD().map(e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()), prepareTable(e))).collect();
+List<Node> nodes = entities
+.toJavaRDD()
+.map(
+e -> new Node(e.getAs("identifier").toString(), vertexes.indexOf(e.getAs("identifier").toString()),
+prepareTable(e)))
+.collect();
-List<Edge> edges = simRels.toJavaRDD().collect().stream().map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget()))).collect(Collectors.toList());
+List<Edge> edges = simRels
+.toJavaRDD()
+.collect()
+.stream()
+.map(sr -> new Edge(vertexes.indexOf(sr.getSource()), vertexes.indexOf(sr.getTarget())))
+.collect(Collectors.toList());
 try (FileWriter fw = new FileWriter(filePath)) {
 String fullText = IOUtils.toString(new FileReader(templateFilePath));
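
Editor's note: the hunk above is cut off just after the template file is read. Below is a minimal sketch of how such a template-filling step can look. The %nodes% and %edges% placeholder names, the helper class, and the use of plain (non-shaded) Jackson are assumptions for illustration, not code from this commit.

import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;

import org.apache.commons.io.IOUtils;

import com.fasterxml.jackson.databind.ObjectMapper;

// Sketch only: serializes the node/edge lists to JSON and substitutes them
// into hypothetical %nodes%/%edges% placeholders of the HTML template.
public class GraphTemplateSketch {
	public static void fillTemplate(List<?> nodes, List<?> edges, String filePath, String templateFilePath)
		throws IOException {
		ObjectMapper mapper = new ObjectMapper();
		String fullText = IOUtils.toString(new FileReader(templateFilePath));
		String populated = fullText
			.replace("%nodes%", mapper.writeValueAsString(nodes))
			.replace("%edges%", mapper.writeValueAsString(edges));
		try (FileWriter fw = new FileWriter(filePath)) {
			fw.write(populated);
		}
	}
}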

View File: SparkDedupLocalTest.java

@@ -1,19 +1,15 @@
 package eu.dnetlib.dhp.oa.dedup.local;
-import com.google.common.collect.Lists;
-import com.kwartile.lib.cc.ConnectedComponent;
-import eu.dnetlib.dhp.oa.dedup.DedupUtility;
-import eu.dnetlib.dhp.schema.common.EntityType;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.OafEntity;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.dhp.schema.oaf.Result;
-import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
-import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.model.SparkDeduper;
-import eu.dnetlib.pace.model.SparkModel;
-import eu.dnetlib.pace.tree.support.TreeProcessor;
-import java.awt.*;
-import java.io.File;
-import java.io.IOException;
-import java.net.URISyntaxException;
-import java.nio.file.Paths;
-import java.util.*;
-import java.util.List;
-import java.util.stream.Stream;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.FlatMapGroupsFunction;
@@ -26,20 +22,27 @@ import org.junit.jupiter.api.*;
 import org.junit.jupiter.api.extension.ExtendWith;
 import org.junit.platform.commons.util.StringUtils;
 import org.mockito.junit.jupiter.MockitoExtension;
+import com.google.common.collect.Lists;
+import com.kwartile.lib.cc.ConnectedComponent;
+import eu.dnetlib.dhp.oa.dedup.DedupUtility;
+import eu.dnetlib.dhp.schema.common.EntityType;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.model.SparkDeduper;
+import eu.dnetlib.pace.model.SparkModel;
+import eu.dnetlib.pace.tree.support.TreeProcessor;
 import scala.Tuple2;
 import scala.Tuple3;
 import scala.collection.JavaConversions;
 import scala.collection.mutable.WrappedArray;
+import java.awt.*;
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.*;
+import java.util.stream.Stream;
 @ExtendWith(MockitoExtension.class)
 @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
 @TestInstance(TestInstance.Lifecycle.PER_CLASS)
@@ -50,12 +53,18 @@ public class SparkDedupLocalTest extends DedupLocalTestUtils {
 static JavaSparkContext context;
 final String entitiesPath = Paths
-.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication")).toURI())
+.get(
+Objects
+.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/entities/publication"))
+.toURI())
 .toFile()
 .getAbsolutePath();
 final String dedupConfPath = Paths
-.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")).toURI())
+.get(
+Objects
+.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))
+.toURI())
 .toFile()
 .getAbsolutePath();
@@ -99,9 +108,11 @@
 Dataset<Relation> simRels = entities
 .transform(deduper.dedup())
 .distinct()
-.map((MapFunction<Row, Relation>) t ->
-DedupUtility.createSimRel(t.getStruct(0).getString(0), t.getStruct(0).getString(1), config.getWf().getEntityType()), Encoders.bean(Relation.class)
-);
+.map(
+(MapFunction<Row, Relation>) t -> DedupUtility
+.createSimRel(
+t.getStruct(0).getString(0), t.getStruct(0).getString(1), config.getWf().getEntityType()),
+Encoders.bean(Relation.class));
 long simrels_time = System.currentTimeMillis() - before_simrels;
@@ -131,7 +142,6 @@
 System.out.println(r.getSource() + " ---> " + r.getTarget());
 }
 // resolve connected components
 // ("vertexId", "groupId")
 Dataset<Row> cliques = ConnectedComponent
@@ -155,18 +165,23 @@
 return res.iterator();
 }, Encoders.bean(Relation.class));
 long mergerels_time = System.currentTimeMillis() - before_mergerels;
 long mergerels_number = mergeRels.count();
 long before_dedupentity = System.currentTimeMillis();
-final Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(config.getWf().getSubEntityValue()));
+final Class<OafEntity> clazz = ModelSupport.entityTypes
+.get(EntityType.valueOf(config.getWf().getSubEntityValue()));
 final Encoder<OafEntity> beanEncoder = Encoders.bean(clazz);
 final Encoder<OafEntity> kryoEncoder = Encoders.kryo(clazz);
-Dataset<Row> kryoEntities = spark.read().schema(Encoders.bean(clazz).schema()).json(entitiesPath).as(beanEncoder).map(
+Dataset<Row> kryoEntities = spark
+.read()
+.schema(Encoders.bean(clazz).schema())
+.json(entitiesPath)
+.as(beanEncoder)
+.map(
 (MapFunction<OafEntity, Tuple2<String, OafEntity>>) entity -> {
 return new Tuple2<>(entity.getId(), entity);
 },
@@ -279,7 +294,8 @@
 List<Row> rows = Lists.newArrayList(a, b);
-Dataset<Row> rowsDS = spark.createDataset(rows, RowEncoder.apply(model.schema()))
+Dataset<Row> rowsDS = spark
+.createDataset(rows, RowEncoder.apply(model.schema()))
 .transform(deduper.filterAndCleanup())
 .transform(deduper.generateClustersWithCollect());
@@ -296,7 +312,15 @@
 entities,
 simRels,
 "/tmp/graph.html",
-Paths.get(Objects.requireNonNull(SparkDedupLocalTest.class.getResource("/eu/dnetlib/dhp/dedup/visualization_tools/graph_template.html")).toURI()).toFile().getAbsolutePath());
+Paths
+.get(
+Objects
+.requireNonNull(
+SparkDedupLocalTest.class
+.getResource("/eu/dnetlib/dhp/dedup/visualization_tools/graph_template.html"))
+.toURI())
+.toFile()
+.getAbsolutePath());
 Desktop.getDesktop().browse(new File("/tmp/graph.html").toURI());
 } catch (Exception e) {
 e.printStackTrace();
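
Editor's note: in the hunks above, the test resolves connected components over the similarity relations with com.kwartile.lib.cc.ConnectedComponent on Spark, producing ("vertexId", "groupId") cliques that then become merge relations. As a plain-Java illustration of that grouping (not the Kwartile API), here is a minimal union-find over (source, target) pairs:

import java.util.HashMap;
import java.util.Map;

// Illustrative only: the same vertex-to-group assignment the test computes
// distributed on Spark, done in memory with union-find.
public class UnionFindSketch {
	private final Map<String, String> parent = new HashMap<>();

	private String find(String v) {
		parent.putIfAbsent(v, v);
		String p = parent.get(v);
		if (p.equals(v))
			return v;
		String root = find(p);
		parent.put(v, root); // path compression
		return root;
	}

	// call once per similarity relation (source, target)
	public void union(String source, String target) {
		parent.put(find(source), find(target));
	}

	// the "groupId" for a given "vertexId"
	public String groupOf(String vertexId) {
		return find(vertexId);
	}
}

Feeding every sim rel through union() and then reading groupOf() for each vertex yields, for small graphs, the same ("vertexId", "groupId") pairs the cliques dataset holds.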

View File: ORCIDAuthorMatchersTest.scala

@@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest {
 assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
 // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
 }
 @Test def testDocumentationNames(): Unit = {
 assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
 }
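
Editor's note: matchOrderedTokenAndAbbreviations is defined elsewhere in ORCIDAuthorMatchers; judging from these assertions alone, it matches name tokens in order while allowing single-letter initials to stand in for full tokens. The Java sketch below approximates that idea; the class name, the match threshold, and the skip logic are assumptions, not the actual Scala implementation.

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

// Rough approximation: tokens must match in order, and a one-letter token
// may abbreviate a longer token ("a" matches "antony").
public class NameMatchSketch {

	public static boolean orderedTokenMatch(String n1, String n2) {
		List<String> t1 = tokenize(n1);
		List<String> t2 = tokenize(n2);
		int i = 0, j = 0, matched = 0;
		while (i < t1.size() && j < t2.size()) {
			if (tokensMatch(t1.get(i), t2.get(j))) {
				matched++;
				i++;
				j++;
			} else if (t1.size() > t2.size()) {
				i++; // skip surplus tokens on the longer side
			} else {
				j++;
			}
		}
		return matched >= 2; // heuristic threshold: surname plus one given-name token
	}

	private static boolean tokensMatch(String a, String b) {
		if (a.equals(b))
			return true;
		return (a.length() == 1 && b.startsWith(a)) || (b.length() == 1 && a.startsWith(b));
	}

	private static List<String> tokenize(String name) {
		return Arrays
			.stream(name.toLowerCase().split("[\\s.]+"))
			.filter(t -> !t.isEmpty())
			.collect(Collectors.toList());
	}
}

With these rules, orderedTokenMatch("James C. A. Miller-Jones", "James Antony Miller-Jones") returns true, mirroring the assertion above.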