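small changes: formatting cleanups (cleanup(), JsonListMatch imports and JsonPath call), checkpoint() after cluster generation and distinct() in place of repartition()/dropDuplicates() in SparkDedupConfig, increment the sliding-window counter in BlockProcessor, and switch SparkCreateSimRels to generateAndProcessClustersWithJoins()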

This commit is contained in:
Giambattista Bloisi 2023-07-04 18:36:58 +02:00
parent 890b49fb5d
commit df19548c56
5 changed files with 14 additions and 12 deletions

View File

@@ -67,7 +67,7 @@ public abstract class AbstractPaceFunctions {
}
protected String cleanup(final String s) {
final String s1 = HTML_REGEX.matcher(s).replaceAll( "");
final String s1 = HTML_REGEX.matcher(s).replaceAll("");
final String s2 = unicodeNormalization(s1.toLowerCase());
final String s3 = nfd(s2);
final String s4 = fixXML(s3);

View File

@@ -91,7 +91,7 @@ case class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Seria
} else {
res
}
})
}).checkpoint()
var relBlocks: Dataset[Row] = null
@@ -178,8 +178,8 @@ case class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Seria
val res = relBlocks.filter(col("match").equalTo(true))
.select(col("l.identifier").as("from"), col("r.identifier").as("to"))
.repartition()
.dropDuplicates()
//.repartition()
.distinct()
// res.show(false)
res.select(functions.struct("from", "to"))

View File

@@ -6,19 +6,19 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.Option;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Sets;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.MapDocumentUtil;
import com.jayway.jsonpath.JsonPath;
@ComparatorClass("jsonListMatch")
public class JsonListMatch extends AbstractListComparator {
@@ -63,7 +63,9 @@ public class JsonListMatch extends AbstractListComparator {
StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into
// parameters
final DocumentContext documentContext = JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS)).parse(json);
final DocumentContext documentContext = JsonPath
.using(Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS))
.parse(json);
// for each path in the param list
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
String path = params.get(key);

View File

@@ -89,7 +89,7 @@ public class BlockProcessor {
break;
}
if (i > wf.getSlidingWindowSize()) {
if (++i > wf.getSlidingWindowSize()) {
break;
}

View File

@@ -91,10 +91,10 @@ public class SparkCreateSimRels extends AbstractSparkAction {
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
.transform(sparkConfig.modelExtractor()) // Extract fields from input json column according to model
// definition
.transform(sparkConfig.generateClustersWithDFAPI()) // generate <key,block> pairs according to
.transform(sparkConfig.generateAndProcessClustersWithJoins()) // generate <key,block> pairs according to
// filters, clusters, and model
// definition
.transform(sparkConfig.processClusters()) // process blocks and emits <from,to> pairs of found
// .transform(sparkConfig.processClusters()) // process blocks and emits <from,to> pairs of found
// similarities
.map(
(MapFunction<Row, Relation>) t -> DedupUtility