Blacklist filtering moved before the cleanup phase in order to have case-sensitive regex #485

Merged
claudio.atzori merged 1 commit from dedup_blacklist_fix into beta 2024-10-28 09:42:52 +01:00
8 changed files with 111 additions and 75 deletions
Showing only changes of commit 6df6b4583e - Show all commits

View File

@ -38,7 +38,7 @@ public class NumAuthorsTitleSuffixPrefixChain extends AbstractClusteringFunction
@Override @Override
protected Collection<String> doApply(Config conf, String s) { protected Collection<String> doApply(Config conf, String s) {
return suffixPrefixChain(cleanup(s), param("mod")); return suffixPrefixChain(cleanup(s), paramOrDefault("mod", 10));
} }
private Collection<String> suffixPrefixChain(String s, int mod) { private Collection<String> suffixPrefixChain(String s, int mod) {

View File

@ -54,6 +54,22 @@ public class FieldDef implements Serializable {
public FieldDef() { public FieldDef() {
} }
public FieldDef clone() {
FieldDef fieldDef = new FieldDef();
fieldDef.setName(this.name);
fieldDef.setPath(this.path);
fieldDef.setType(this.type);
fieldDef.setOverrideMatch(this.overrideMatch);
fieldDef.setSize(this.size);
fieldDef.setLength(this.length);
fieldDef.setFilter(this.filter);
fieldDef.setSorted(this.sorted);
fieldDef.setClean(this.clean);
fieldDef.setInfer(this.infer);
fieldDef.setInferenceFrom(this.inferenceFrom);
return fieldDef;
}
public String getInferenceFrom() { public String getInferenceFrom() {
return inferenceFrom; return inferenceFrom;
} }

View File

@ -19,48 +19,10 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable {
val model: SparkModel = SparkModel(conf) val model: SparkModel = SparkModel(conf)
val dedup: (Dataset[Row] => Dataset[Row]) = df => { val dedup: (Dataset[Row] => Dataset[Row]) = df => {
df.transform(filterAndCleanup) df.transform(generateClustersWithCollect)
.transform(generateClustersWithCollect)
.transform(processBlocks) .transform(processBlocks)
} }
val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => {
val df_with_filters = conf.getPace.getModel.asScala.foldLeft(df)((res, fdef) => {
if (conf.blacklists.containsKey(fdef.getName)) {
res.withColumn(
fdef.getName + "_filtered",
filterColumnUDF(fdef).apply(new Column(fdef.getName))
)
} else {
res
}
})
df_with_filters
}
def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
if (blacklist == null) {
throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter")
} else {
fdef.getType match {
case Type.List | Type.JSON =>
udf[Array[String], Array[String]](values => {
values.filter((v: String) => !blacklist.test(v))
})
case _ =>
udf[String, String](v => {
if (blacklist.test(v)) ""
else v
})
}
}
}
val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => { val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => {
var df_with_clustering_keys: Dataset[Row] = null var df_with_clustering_keys: Dataset[Row] = null

View File

@ -5,12 +5,12 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions
import eu.dnetlib.pace.config.{DedupConfig, Type} import eu.dnetlib.pace.config.{DedupConfig, Type}
import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils} import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.{Dataset, Row}
import java.util.Locale import java.util.Locale
import java.util.function.Predicate
import java.util.regex.Pattern import java.util.regex.Pattern
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
@ -29,8 +29,20 @@ case class SparkModel(conf: DedupConfig) {
identifier.setName(identifierFieldName) identifier.setName(identifierFieldName)
identifier.setType(Type.String) identifier.setType(Type.String)
// create fields for blacklist
val filtered = conf.getPace.getModel.asScala.flatMap(fdef => {
if (conf.blacklists().containsKey(fdef.getName)) {
val fdef_filtered = fdef.clone()
fdef_filtered.setName(fdef.getName + "_filtered")
Seq(fdef, fdef_filtered)
}
else {
Seq(fdef)
}
})
// Construct a Spark StructType representing the schema of the model // Construct a Spark StructType representing the schema of the model
(Seq(identifier) ++ conf.getPace.getModel.asScala) (Seq(identifier) ++ filtered)
.foldLeft( .foldLeft(
new StructType() new StructType()
)((resType, fieldDef) => { )((resType, fieldDef) => {
@ -44,7 +56,6 @@ case class SparkModel(conf: DedupConfig) {
}) })
}) })
} }
val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName) val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)
@ -52,7 +63,8 @@ case class SparkModel(conf: DedupConfig) {
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema)) df
.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
} }
def rowFromJson(json: String): Row = { def rowFromJson(json: String): Row = {
@ -64,41 +76,63 @@ case class SparkModel(conf: DedupConfig) {
schema.fieldNames.zipWithIndex.foldLeft(values) { schema.fieldNames.zipWithIndex.foldLeft(values) {
case ((res, (fname, index))) => case ((res, (fname, index))) =>
val fdef = conf.getPace.getModelMap.get(fname)
val fdef = conf.getPace.getModelMap.get(fname.split("_filtered")(0))
if (fdef != null) { if (fdef != null) {
res(index) = fdef.getType match { if (!fname.contains("_filtered")) { //process fields with no blacklist
case Type.String | Type.Int => res(index) = fdef.getType match {
MapDocumentUtil.truncateValue( case Type.String | Type.Int =>
MapDocumentUtil.getJPathString(fdef.getPath, documentContext), MapDocumentUtil.truncateValue(
fdef.getLength MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
) fdef.getLength
)
case Type.URL => case Type.URL =>
var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext) var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
if (!URL_REGEX.matcher(uv).matches) if (!URL_REGEX.matcher(uv).matches)
uv = "" uv = ""
uv uv
case Type.List | Type.JSON => case Type.List | Type.JSON =>
MapDocumentUtil.truncateList( MapDocumentUtil.truncateList(
MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType), MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
fdef.getSize fdef.getSize
).asScala ).asScala
case Type.StringConcat => case Type.StringConcat =>
val jpaths = CONCAT_REGEX.split(fdef.getPath) val jpaths = CONCAT_REGEX.split(fdef.getPath)
MapDocumentUtil.truncateValue( MapDocumentUtil.truncateValue(
jpaths jpaths
.map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext)) .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
.mkString(" "), .mkString(" "),
fdef.getLength fdef.getLength
) )
case Type.DoubleArray => case Type.DoubleArray =>
MapDocumentUtil.getJPathArray(fdef.getPath, json) MapDocumentUtil.getJPathArray(fdef.getPath, json)
}
} }
else { //process fields with blacklist
val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
res(index) = fdef.getType match {
case Type.List | Type.JSON =>
MapDocumentUtil.truncateList(
MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
fdef.getSize
).asScala.filter((v: String) => !blacklist.test(v))
case _ =>
val value: String = MapDocumentUtil.truncateValue(
MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
fdef.getLength
)
if (blacklist.test(value)) "" else value
}
}
val filter = fdef.getFilter val filter = fdef.getFilter
@ -125,13 +159,12 @@ case class SparkModel(conf: DedupConfig) {
} }
if (StringUtils.isNotBlank(fdef.getInfer)) { if (StringUtils.isNotBlank(fdef.getInfer)) {
val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath val inferFrom: String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
res(index) = res(index) match { res(index) = res(index) match {
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)) case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer) case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
} }
} }
} }
res res
@ -139,6 +172,7 @@ case class SparkModel(conf: DedupConfig) {
} }
new GenericRowWithSchema(values, schema) new GenericRowWithSchema(values, schema)
} }
def clean(value: String, cleantype: String) : String = { def clean(value: String, cleantype: String) : String = {

View File

@ -227,4 +227,17 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(cf.apply(conf, Lists.newArrayList(s))); System.out.println(cf.apply(conf, Lists.newArrayList(s)));
} }
@Test
public void testNumAuthorsTitleSuffixPrefixChain() {
final ClusteringFunction cf = new NumAuthorsTitleSuffixPrefixChain(params);
params.put("mod", 10);
final String title = "PARP-2 Regulates SIRT1 Expression and Whole-Body Energy Expenditure";
final String num_authors = "10";
System.out.println("title = " + title);
System.out.println("num_authors = " + num_authors);
System.out.println(cf.apply(conf, Lists.newArrayList(num_authors, title)));
}
} }

View File

@ -327,4 +327,16 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test
public void titleVersionMatchTest() {
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result = titleVersionMatch
.compare(
"parp 2 regulates sirt 1 expression and whole body energy expenditure",
"parp 2 regulates sirt 1 expression and whole body energy expenditure", conf);
assertEquals(1.0, result);
}
} }

View File

@ -91,7 +91,6 @@ public class SparkBlockStats extends AbstractSparkAction {
.read() .read()
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
.transform(deduper.model().parseJsonDataset()) .transform(deduper.model().parseJsonDataset())
.transform(deduper.filterAndCleanup())
.transform(deduper.generateClustersWithCollect()) .transform(deduper.generateClustersWithCollect())
.filter(functions.size(new Column("block")).geq(1)); .filter(functions.size(new Column("block")).geq(1));

View File

@ -96,7 +96,7 @@
"aggregation": "MAX", "aggregation": "MAX",
"positive": "layer4", "positive": "layer4",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "MATCH", "undefined": "layer4",
"ignoreUndefined": "true" "ignoreUndefined": "true"
}, },
"layer4": { "layer4": {