dnet-hadoop/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala

package eu.dnetlib.pace.model

import com.jayway.jsonpath.{Configuration, JsonPath}
import eu.dnetlib.pace.common.AbstractPaceFunctions
import eu.dnetlib.pace.config.{DedupConfig, Type}
import eu.dnetlib.pace.util.MapDocumentUtil
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
import org.apache.spark.sql.{Dataset, Row}

import java.util.Locale
import java.util.regex.Pattern
import scala.collection.JavaConverters._

/** Converts JSON documents into Spark [[Row]]s laid out according to the model of a [[DedupConfig]].
  *
  * The row schema is made of an implicit string `identifier` column followed by one column per
  * field definition returned by `conf.getPace.getModel`.
  *
  * @param conf dedup configuration providing the model (`getPace`) and the workflow settings (`getWf`)
  */
case class SparkModel(conf: DedupConfig) {

  /** Accepts values that start, after optional whitespace, with an `http`, `https` or `ftp` scheme. */
  private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*")

  /** Separator (`|||`) between the individual JSON paths of a `StringConcat` field definition. */
  private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|")

  /** Name of the implicit column that holds the record identifier. */
  val identifierFieldName: String = "identifier"

  /** Name of the column used for ordering: the configured order field, or the identifier column
    * when no order field is configured (null or empty).
    */
  val orderingFieldName: String =
    // StringUtils.isNotEmpty is null-safe: a missing order field falls back to the identifier
    // instead of failing with a NullPointerException.
    if (StringUtils.isNotEmpty(conf.getWf.getOrderField)) conf.getWf.getOrderField else identifierFieldName

  /** Spark schema of the model: the implicit identifier column first, then the model fields in
    * the order returned by `conf.getPace.getModel`. Every column is nullable.
    */
  val schema: StructType = {
    // create an implicit identifier field
    val identifier = new FieldDef()
    identifier.setName(identifierFieldName)
    identifier.setType(Type.String)

    (Seq(identifier) ++ conf.getPace.getModel.asScala)
      .foldLeft(new StructType())((acc, fieldDef) => acc.add(structFieldOf(fieldDef)))
  }

  /** Position of the identifier column inside [[schema]]. */
  val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)

  /** Position of the ordering column inside [[schema]]. */
  val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)

  /** Maps every JSON string of a dataset to a [[Row]] through [[rowFromJson]]. */
  val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
    df.map(r => rowFromJson(r))(RowEncoder(schema))
  }

  /** Builds the Spark column for a field definition: `List` and `JSON` fields become arrays of
    * strings, `DoubleArray` fields become arrays of doubles, everything else is a string.
    */
  private def structFieldOf(fieldDef: FieldDef): StructField = {
    val dataType = fieldDef.getType match {
      case Type.List | Type.JSON => DataTypes.createArrayType(DataTypes.StringType)
      case Type.DoubleArray      => DataTypes.createArrayType(DataTypes.DoubleType)
      case _                     => DataTypes.StringType
    }
    StructField(fieldDef.getName, dataType, true, Metadata.empty)
  }

  /** Extracts the model fields from a JSON document.
    *
    * For every column that has a field definition in `conf.getPace.getModelMap` the value is read
    * with the field's JSON path and then, in this order, cleaned (when `getClean` is not blank),
    * filtered (when `getFilter` is not empty) and sorted (when `getSorted` is set). Columns
    * without a field definition keep the value assigned before the loop: the identifier read
    * from `conf.getWf.getIdPath` for the identifier column, `null` otherwise.
    *
    * @param json the JSON document to parse
    * @return a row whose values follow [[schema]]
    */
  def rowFromJson(json: String): Row = {
    val documentContext = JsonPath
      .using(Configuration.defaultConfiguration.addOptions(com.jayway.jsonpath.Option.SUPPRESS_EXCEPTIONS))
      .parse(json)
    val values = new Array[Any](schema.size)

    values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)

    schema.fieldNames.zipWithIndex.foreach { case (fname, index) =>
      val fdef = conf.getPace.getModelMap.get(fname)

      if (fdef != null) {
        // 1. read the raw value according to the declared field type
        val extracted: Any = fdef.getType match {
          case Type.String | Type.Int =>
            MapDocumentUtil.truncateValue(
              MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
              fdef.getLength
            )

          case Type.URL =>
            // values that do not look like an http/https/ftp URL are replaced by the empty string
            val url = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
            if (URL_REGEX.matcher(url).matches) url else ""

          case Type.List | Type.JSON =>
            MapDocumentUtil
              .truncateList(
                MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
                fdef.getSize
              )
              .asScala

          case Type.StringConcat =>
            // the path holds several JSON paths separated by "|||"; their values are joined by a space
            val joined = CONCAT_REGEX
              .split(fdef.getPath)
              .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
              .mkString(" ")
            MapDocumentUtil.truncateValue(joined, fdef.getLength)

          case Type.DoubleArray =>
            MapDocumentUtil.getJPathArray(fdef.getPath, json)
        }

        // 2. clean string values; anything that is neither a string nor a sequence of strings
        //    (null, double arrays) is passed through untouched so that it keeps matching the
        //    column type declared in the schema
        val cleanType = fdef.getClean
        val cleaned: Any =
          if (StringUtils.isNotBlank(cleanType)) {
            extracted match {
              case xs: scala.collection.Seq[String @unchecked] => xs.map(clean(_, cleanType)).toSeq
              case s: String                                   => clean(s, cleanType)
              case other                                       => other
            }
          } else extracted

        // 3. drop values whose lower-cased form is listed in the field filter
        val filter = fdef.getFilter
        val filtered: Any =
          if (filter != null && !filter.isEmpty) {
            cleaned match {
              case s: String if filter.contains(s.toLowerCase(Locale.ROOT)) => null
              case xs: scala.collection.Seq[String @unchecked] =>
                xs.filter(s => !filter.contains(s.toLowerCase(Locale.ROOT))).toSeq
              case other => other
            }
          } else cleaned

        // 4. sort list values when requested by the field definition
        values(index) =
          if (fdef.getSorted) {
            filtered match {
              case xs: scala.collection.Seq[String @unchecked] => xs.sorted.toSeq
              case other                                       => other
            }
          } else filtered
      }
    }

    new GenericRowWithSchema(values, schema)
  }

  /** Applies the cleaning strategy named by `cleantype` to a value.
    *
    * @param value     the string to clean
    * @param cleantype name of the cleaning strategy; `"title"` delegates to
    *                  `AbstractPaceFunctions.cleanup`, any other name leaves the value unchanged
    * @return the cleaned value
    */
  def clean(value: String, cleantype: String): String =
    cleantype match {
      case "title" => AbstractPaceFunctions.cleanup(value)
      case _       => value
    }

}
Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) 2023-07-18 11:38:56 +02:00			`package eu.dnetlib.pace.model`

			`import com.jayway.jsonpath.{Configuration, JsonPath}`
SparkCreateSimRels: - Create dedup blocks from the complete queue of records matching cluster key instead of truncating the results - Clean titles once before clustering and similarity comparisons - Added support for filtered fields in model - Added support for sorting List fields in model - Added new JSONListClustering and numAuthorsTitleSuffixPrefixChain clustering functions - Added new maxLengthMatch comparator function - Use reduced complexity Levenshtein with threshold in levensteinTitle - Use reduced complexity AuthorsMatch with threshold early-quit - Use incremental Connected Component to decrease comparisons in similarity match in BlockProcessor - Use new clusterings configuration in Dedup tests SparkWhitelistSimRels: use left semi join for clarity and performance SparkCreateMergeRels: - Use new connected component algorithm that converge faster than Spark GraphX provided algorithm - Refactored to use Windowing sorting rather than groupBy to reduce memory pressure - Use historical pivot table to generate singleton rels, merged rels and keep continuity with dedupIds used in the past - Comparator for pivot record selection now uses "tomorrow" as filler for missing or incorrect date instead of "2000-01-01" - Changed generation of ids of type dedup_wf_001 to avoid collisions DedupRecordFactory: use reduceGroups instead of mapGroups to decrease memory pressure 2023-10-02 09:25:12 +02:00			`import eu.dnetlib.pace.common.AbstractPaceFunctions`
Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) 2023-07-18 11:38:56 +02:00			`import eu.dnetlib.pace.config.{DedupConfig, Type}`
			`import eu.dnetlib.pace.util.MapDocumentUtil`
SparkCreateSimRels: - Create dedup blocks from the complete queue of records matching cluster key instead of truncating the results - Clean titles once before clustering and similarity comparisons - Added support for filtered fields in model - Added support for sorting List fields in model - Added new JSONListClustering and numAuthorsTitleSuffixPrefixChain clustering functions - Added new maxLengthMatch comparator function - Use reduced complexity Levenshtein with threshold in levensteinTitle - Use reduced complexity AuthorsMatch with threshold early-quit - Use incremental Connected Component to decrease comparisons in similarity match in BlockProcessor - Use new clusterings configuration in Dedup tests SparkWhitelistSimRels: use left semi join for clarity and performance SparkCreateMergeRels: - Use new connected component algorithm that converge faster than Spark GraphX provided algorithm - Refactored to use Windowing sorting rather than groupBy to reduce memory pressure - Use historical pivot table to generate singleton rels, merged rels and keep continuity with dedupIds used in the past - Comparator for pivot record selection now uses "tomorrow" as filler for missing or incorrect date instead of "2000-01-01" - Changed generation of ids of type dedup_wf_001 to avoid collisions DedupRecordFactory: use reduceGroups instead of mapGroups to decrease memory pressure 2023-10-02 09:25:12 +02:00			`import org.apache.commons.lang3.StringUtils`
Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) 2023-07-18 11:38:56 +02:00			`import org.apache.spark.sql.catalyst.encoders.RowEncoder`
			`import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema`
			`import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}`
			`import org.apache.spark.sql.{Dataset, Row}`

SparkCreateSimRels: - Create dedup blocks from the complete queue of records matching cluster key instead of truncating the results - Clean titles once before clustering and similarity comparisons - Added support for filtered fields in model - Added support for sorting List fields in model - Added new JSONListClustering and numAuthorsTitleSuffixPrefixChain clustering functions - Added new maxLengthMatch comparator function - Use reduced complexity Levenshtein with threshold in levensteinTitle - Use reduced complexity AuthorsMatch with threshold early-quit - Use incremental Connected Component to decrease comparisons in similarity match in BlockProcessor - Use new clusterings configuration in Dedup tests SparkWhitelistSimRels: use left semi join for clarity and performance SparkCreateMergeRels: - Use new connected component algorithm that converge faster than Spark GraphX provided algorithm - Refactored to use Windowing sorting rather than groupBy to reduce memory pressure - Use historical pivot table to generate singleton rels, merged rels and keep continuity with dedupIds used in the past - Comparator for pivot record selection now uses "tomorrow" as filler for missing or incorrect date instead of "2000-01-01" - Changed generation of ids of type dedup_wf_001 to avoid collisions DedupRecordFactory: use reduceGroups instead of mapGroups to decrease memory pressure 2023-10-02 09:25:12 +02:00			`import java.util.Locale`
Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) 2023-07-18 11:38:56 +02:00			`import java.util.regex.Pattern`
			`import scala.collection.JavaConverters._`

			`case class SparkModel(conf: DedupConfig) {`
			`private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http\|https\|ftp)\\://.*")`

			`private val CONCAT_REGEX: Pattern = Pattern.compile("\\\|\\\|\\\|")`

			`val identifierFieldName = "identifier"`

			`val orderingFieldName = if (!conf.getWf.getOrderField.isEmpty) conf.getWf.getOrderField else identifierFieldName`

			`val schema: StructType = {`
			`// create an implicit identifier field`
			`val identifier = new FieldDef()`
			`identifier.setName(identifierFieldName)`
			`identifier.setType(Type.String)`

			`// Construct a Spark StructType representing the schema of the model`
			`(Seq(identifier) ++ conf.getPace.getModel.asScala)`
			`.foldLeft(`
			`new StructType()`
			`)((resType, fieldDef) => {`
			`resType.add(fieldDef.getType match {`
			`case Type.List \| Type.JSON =>`
			`StructField(fieldDef.getName, DataTypes.createArrayType(DataTypes.StringType), true, Metadata.empty)`
			`case Type.DoubleArray =>`
			`StructField(fieldDef.getName, DataTypes.createArrayType(DataTypes.DoubleType), true, Metadata.empty)`
			`case _ =>`
			`StructField(fieldDef.getName, DataTypes.StringType, true, Metadata.empty)`
			`})`
			`})`


			`}`

			`val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)`

			`val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)`

			`val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {`
			`df.map(r => rowFromJson(r))(RowEncoder(schema))`
			`}`

			`def rowFromJson(json: String): Row = {`
			`val documentContext =`
			`JsonPath.using(Configuration.defaultConfiguration.addOptions(com.jayway.jsonpath.Option.SUPPRESS_EXCEPTIONS)).parse(json)`
			`val values = new Array[Any](schema.size)`

			`values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)`

			`schema.fieldNames.zipWithIndex.foldLeft(values) {`
SparkCreateSimRels: - Create dedup blocks from the complete queue of records matching cluster key instead of truncating the results - Clean titles once before clustering and similarity comparisons - Added support for filtered fields in model - Added support for sorting List fields in model - Added new JSONListClustering and numAuthorsTitleSuffixPrefixChain clustering functions - Added new maxLengthMatch comparator function - Use reduced complexity Levenshtein with threshold in levensteinTitle - Use reduced complexity AuthorsMatch with threshold early-quit - Use incremental Connected Component to decrease comparisons in similarity match in BlockProcessor - Use new clusterings configuration in Dedup tests SparkWhitelistSimRels: use left semi join for clarity and performance SparkCreateMergeRels: - Use new connected component algorithm that converge faster than Spark GraphX provided algorithm - Refactored to use Windowing sorting rather than groupBy to reduce memory pressure - Use historical pivot table to generate singleton rels, merged rels and keep continuity with dedupIds used in the past - Comparator for pivot record selection now uses "tomorrow" as filler for missing or incorrect date instead of "2000-01-01" - Changed generation of ids of type dedup_wf_001 to avoid collisions DedupRecordFactory: use reduceGroups instead of mapGroups to decrease memory pressure 2023-10-02 09:25:12 +02:00			`case ((res, (fname, index))) =>`
Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) 2023-07-18 11:38:56 +02:00			`val fdef = conf.getPace.getModelMap.get(fname)`

			`if (fdef != null) {`
			`res(index) = fdef.getType match {`
			`case Type.String \| Type.Int =>`
			`MapDocumentUtil.truncateValue(`
			`MapDocumentUtil.getJPathString(fdef.getPath, documentContext),`
			`fdef.getLength`
			`)`

			`case Type.URL =>`
			`var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)`
			`if (!URL_REGEX.matcher(uv).matches)`
			`uv = ""`
			`uv`

			`case Type.List \| Type.JSON =>`
Use asScala to convert java List to Scala Sequence 2023-09-20 16:14:01 +02:00			`MapDocumentUtil.truncateList(`
Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) 2023-07-18 11:38:56 +02:00			`MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),`
			`fdef.getSize`
Use asScala to convert java List to Scala Sequence 2023-09-20 16:14:01 +02:00			`).asScala`
Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) 2023-07-18 11:38:56 +02:00
			`case Type.StringConcat =>`
			`val jpaths = CONCAT_REGEX.split(fdef.getPath)`

			`MapDocumentUtil.truncateValue(`
			`jpaths`
			`.map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))`
			`.mkString(" "),`
			`fdef.getLength`
			`)`

			`case Type.DoubleArray =>`
			`MapDocumentUtil.getJPathArray(fdef.getPath, json)`
			`}`
SparkCreateSimRels: - Create dedup blocks from the complete queue of records matching cluster key instead of truncating the results - Clean titles once before clustering and similarity comparisons - Added support for filtered fields in model - Added support for sorting List fields in model - Added new JSONListClustering and numAuthorsTitleSuffixPrefixChain clustering functions - Added new maxLengthMatch comparator function - Use reduced complexity Levenshtein with threshold in levensteinTitle - Use reduced complexity AuthorsMatch with threshold early-quit - Use incremental Connected Component to decrease comparisons in similarity match in BlockProcessor - Use new clusterings configuration in Dedup tests SparkWhitelistSimRels: use left semi join for clarity and performance SparkCreateMergeRels: - Use new connected component algorithm that converge faster than Spark GraphX provided algorithm - Refactored to use Windowing sorting rather than groupBy to reduce memory pressure - Use historical pivot table to generate singleton rels, merged rels and keep continuity with dedupIds used in the past - Comparator for pivot record selection now uses "tomorrow" as filler for missing or incorrect date instead of "2000-01-01" - Changed generation of ids of type dedup_wf_001 to avoid collisions DedupRecordFactory: use reduceGroups instead of mapGroups to decrease memory pressure 2023-10-02 09:25:12 +02:00
			`val filter = fdef.getFilter`

			`if (StringUtils.isNotBlank(fdef.getClean)) {`
			`res(index) = res(index) match {`
			`case x: Seq[String] => x.map(clean(_, fdef.getClean)).toSeq`
			`case _ => clean(res(index).toString, fdef.getClean)`
			`}`
			`}`

			`if (filter != null && !filter.isEmpty) {`
			`res(index) = res(index) match {`
			`case x: String if filter.contains(x.toLowerCase(Locale.ROOT)) => null`
			`case x: Seq[String] => x.filter(s => !filter.contains(s.toLowerCase(Locale.ROOT))).toSeq`
			`case _ => res(index)`
			`}`
			`}`

			`if (fdef.getSorted) {`
			`res(index) = res(index) match {`
			`case x: Seq[String] => x.sorted.toSeq`
			`case _ => res(index)`
			`}`
			`}`
Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) 2023-07-18 11:38:56 +02:00			`}`

			`res`
			`}`

			`new GenericRowWithSchema(values, schema)`
			`}`
SparkCreateSimRels: - Create dedup blocks from the complete queue of records matching cluster key instead of truncating the results - Clean titles once before clustering and similarity comparisons - Added support for filtered fields in model - Added support for sorting List fields in model - Added new JSONListClustering and numAuthorsTitleSuffixPrefixChain clustering functions - Added new maxLengthMatch comparator function - Use reduced complexity Levenshtein with threshold in levensteinTitle - Use reduced complexity AuthorsMatch with threshold early-quit - Use incremental Connected Component to decrease comparisons in similarity match in BlockProcessor - Use new clusterings configuration in Dedup tests SparkWhitelistSimRels: use left semi join for clarity and performance SparkCreateMergeRels: - Use new connected component algorithm that converge faster than Spark GraphX provided algorithm - Refactored to use Windowing sorting rather than groupBy to reduce memory pressure - Use historical pivot table to generate singleton rels, merged rels and keep continuity with dedupIds used in the past - Comparator for pivot record selection now uses "tomorrow" as filler for missing or incorrect date instead of "2000-01-01" - Changed generation of ids of type dedup_wf_001 to avoid collisions DedupRecordFactory: use reduceGroups instead of mapGroups to decrease memory pressure 2023-10-02 09:25:12 +02:00
			`def clean(value: String, cleantype: String) : String = {`
			`val res = cleantype match {`
			`case "title" => AbstractPaceFunctions.cleanup(value)`
			`case _ => value`
			`}`

			`// if (!res.equals(AbstractPaceFunctions.normalize(value))) {`
			`// println(res)`
			`// println(AbstractPaceFunctions.normalize(value))`
			`// println()`
			`// }`

			`res`
			`}`

Refactor Dedup process to use Spark Dataframe API and intermediate representation with Row interface JsonPath cache contention fixed by using a ConcurrentHashMap Blacklist filtering performance improvement Minor performance improvements when evaluating similarity Sorting in clustered elements is deterministic (by ordering and identity field, instead of ordering field only) 2023-07-18 11:38:56 +02:00			`}`