package eu.dnetlib.dhp.oa.dedup;

import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkContext;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;

import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;

import scala.Tuple2;
public class DedupUtility {
2019-12-11 15:43:24 +01:00
private static final Double THRESHOLD = 0 . 95 ;
2019-12-06 13:38:00 +01:00
public static Map < String , LongAccumulator > constructAccumulator ( final DedupConfig dedupConf , final SparkContext context ) {
Map < String , LongAccumulator > accumulators = new HashMap < > ( ) ;
2019-12-11 15:43:24 +01:00
String acc1 = String . format ( " %s::%s " , dedupConf . getWf ( ) . getEntityType ( ) , " records per hash key = 1 " ) ;
2019-12-06 13:38:00 +01:00
accumulators . put ( acc1 , context . longAccumulator ( acc1 ) ) ;
2019-12-11 15:43:24 +01:00
String acc2 = String . format ( " %s::%s " , dedupConf . getWf ( ) . getEntityType ( ) , " missing " + dedupConf . getWf ( ) . getOrderField ( ) ) ;
2019-12-06 13:38:00 +01:00
accumulators . put ( acc2 , context . longAccumulator ( acc2 ) ) ;
2019-12-11 15:43:24 +01:00
String acc3 = String . format ( " %s::%s " , dedupConf . getWf ( ) . getEntityType ( ) , String . format ( " Skipped records for count(%s) >= %s " , dedupConf . getWf ( ) . getOrderField ( ) , dedupConf . getWf ( ) . getGroupMaxSize ( ) ) ) ;
2019-12-06 13:38:00 +01:00
accumulators . put ( acc3 , context . longAccumulator ( acc3 ) ) ;
2019-12-11 15:43:24 +01:00
String acc4 = String . format ( " %s::%s " , dedupConf . getWf ( ) . getEntityType ( ) , " skip list " ) ;
2019-12-06 13:38:00 +01:00
accumulators . put ( acc4 , context . longAccumulator ( acc4 ) ) ;
2019-12-11 15:43:24 +01:00
String acc5 = String . format ( " %s::%s " , dedupConf . getWf ( ) . getEntityType ( ) , " dedupSimilarity (x2) " ) ;
2019-12-06 13:38:00 +01:00
accumulators . put ( acc5 , context . longAccumulator ( acc5 ) ) ;
2019-12-11 15:43:24 +01:00
String acc6 = String . format ( " %s::%s " , dedupConf . getWf ( ) . getEntityType ( ) , " d < " + dedupConf . getWf ( ) . getThreshold ( ) ) ;
2019-12-06 13:38:00 +01:00
accumulators . put ( acc6 , context . longAccumulator ( acc6 ) ) ;
return accumulators ;
}
static Set < String > getGroupingKeys ( DedupConfig conf , MapDocument doc ) {
return Sets . newHashSet ( BlacklistAwareClusteringCombiner . filterAndCombine ( doc , conf ) ) ;
}
public static String md5 ( final String s ) {
try {
final MessageDigest md = MessageDigest . getInstance ( " MD5 " ) ;
md . update ( s . getBytes ( " UTF-8 " ) ) ;
return new String ( Hex . encodeHex ( md . digest ( ) ) ) ;
} catch ( final Exception e ) {
System . err . println ( " Error creating id " ) ;
return null ;
}
}
2019-12-11 15:43:24 +01:00
public static List < Author > mergeAuthor ( final List < Author > a , final List < Author > b ) {
int pa = countAuthorsPids ( a ) ;
int pb = countAuthorsPids ( b ) ;
List < Author > base , enrich ;
int sa = authorsSize ( a ) ;
int sb = authorsSize ( b ) ;
2019-12-12 15:18:48 +01:00
if ( pa = = pb ) {
base = sa > sb ? a : b ;
enrich = sa > sb ? b : a ;
2019-12-11 15:43:24 +01:00
} else {
2019-12-12 15:18:48 +01:00
base = pa > pb ? a : b ;
enrich = pa > pb ? b : a ;
2019-12-11 15:43:24 +01:00
}
enrichPidFromList ( base , enrich ) ;
return base ;
}
private static void enrichPidFromList ( List < Author > base , List < Author > enrich ) {
2019-12-12 15:18:48 +01:00
if ( base = = null | | enrich = = null )
2019-12-11 15:43:24 +01:00
return ;
final Map < String , Author > basePidAuthorMap = base . stream ( )
. filter ( a - > a . getPid ( ) ! = null & & a . getPid ( ) . size ( ) > 0 )
. flatMap ( a - > a . getPid ( )
. stream ( )
. map ( p - > new Tuple2 < > ( p . toComparableString ( ) , a ) )
2019-12-12 15:18:48 +01:00
) . collect ( Collectors . toMap ( Tuple2 : : _1 , Tuple2 : : _2 , ( x1 , x2 ) - > x1 ) ) ;
2019-12-11 15:43:24 +01:00
final List < Tuple2 < StructuredProperty , Author > > pidToEnrich = enrich
. stream ( )
. filter ( a - > a . getPid ( ) ! = null & & a . getPid ( ) . size ( ) > 0 )
. flatMap ( a - > a . getPid ( ) . stream ( ) . filter ( p - > ! basePidAuthorMap . containsKey ( p . toComparableString ( ) ) ) . map ( p - > new Tuple2 < > ( p , a ) ) )
. collect ( Collectors . toList ( ) ) ;
pidToEnrich . forEach ( a - > {
Optional < Tuple2 < Double , Author > > simAuhtor = base . stream ( ) . map ( ba - > new Tuple2 < > ( sim ( ba , a . _2 ( ) ) , ba ) ) . max ( Comparator . comparing ( Tuple2 : : _1 ) ) ;
2019-12-12 15:18:48 +01:00
if ( simAuhtor . isPresent ( ) & & simAuhtor . get ( ) . _1 ( ) > THRESHOLD ) {
2019-12-11 15:43:24 +01:00
Author r = simAuhtor . get ( ) . _2 ( ) ;
r . getPid ( ) . add ( a . _1 ( ) ) ;
}
} ) ;
}
2020-03-20 13:01:56 +01:00
public static String createDedupRecordPath ( final String basePath , final String actionSetId , final String entityType ) {
return String . format ( " %s/%s/%s_deduprecord " , basePath , actionSetId , entityType ) ;
}
2019-12-11 15:43:24 +01:00
public static String createEntityPath ( final String basePath , final String entityType ) {
2019-12-12 15:18:48 +01:00
return String . format ( " %s/%s " , basePath , entityType ) ;
2019-12-11 15:43:24 +01:00
}
2020-03-19 15:01:07 +01:00
public static String createSimRelPath ( final String basePath , final String actionSetId , final String entityType ) {
return String . format ( " %s/%s/%s_simrel " , basePath , actionSetId , entityType ) ;
2019-12-11 15:43:24 +01:00
}
2020-03-19 15:01:07 +01:00
public static String createMergeRelPath ( final String basePath , final String actionSetId , final String entityType ) {
return String . format ( " %s/%s/%s_mergerel " , basePath , actionSetId , entityType ) ;
2019-12-11 15:43:24 +01:00
}
private static Double sim ( Author a , Author b ) {
final Person pa = parse ( a ) ;
final Person pb = parse ( b ) ;
if ( pa . isAccurate ( ) & pb . isAccurate ( ) ) {
return new JaroWinkler ( ) . score (
normalize ( pa . getSurnameString ( ) ) ,
normalize ( pb . getSurnameString ( ) ) ) ;
} else {
return new JaroWinkler ( ) . score (
normalize ( pa . getNormalisedFullname ( ) ) ,
normalize ( pb . getNormalisedFullname ( ) ) ) ;
}
}
private static String normalize ( final String s ) {
return nfd ( s ) . toLowerCase ( )
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
. replaceAll ( " ( \\ W)+ " , " " )
. replaceAll ( " ( \\ p{InCombiningDiacriticalMarks})+ " , " " )
. replaceAll ( " ( \\ p{Punct})+ " , " " )
. replaceAll ( " ( \\ d)+ " , " " )
. replaceAll ( " ( \\ n)+ " , " " )
. trim ( ) ;
}
private static String nfd ( final String s ) {
return Normalizer . normalize ( s , Normalizer . Form . NFD ) ;
}
2019-12-12 15:18:48 +01:00
2019-12-11 15:43:24 +01:00
private static Person parse ( Author author ) {
if ( StringUtils . isNotBlank ( author . getSurname ( ) ) ) {
return new Person ( author . getSurname ( ) + " , " + author . getName ( ) , false ) ;
} else {
return new Person ( author . getFullname ( ) , false ) ;
}
}
private static int countAuthorsPids ( List < Author > authors ) {
if ( authors = = null )
return 0 ;
2019-12-12 15:18:48 +01:00
return ( int ) authors . stream ( ) . filter ( DedupUtility : : hasPid ) . count ( ) ;
2019-12-11 15:43:24 +01:00
}
private static int authorsSize ( List < Author > authors ) {
if ( authors = = null )
return 0 ;
return authors . size ( ) ;
}
2019-12-12 15:18:48 +01:00
private static boolean hasPid ( Author a ) {
2019-12-11 15:43:24 +01:00
if ( a = = null | | a . getPid ( ) = = null | | a . getPid ( ) . size ( ) = = 0 )
2019-12-12 15:18:48 +01:00
return false ;
return a . getPid ( ) . stream ( ) . anyMatch ( p - > p ! = null & & StringUtils . isNotBlank ( p . getValue ( ) ) ) ;
2019-12-11 15:43:24 +01:00
}
2020-03-19 15:01:07 +01:00
public static List < DedupConfig > getConfigurations ( String isLookUpUrl , String orchestrator ) throws ISLookUpException , DocumentException {
final ISLookUpService isLookUpService = ISLookupClientFactory . getLookUpService ( isLookUpUrl ) ;
final String xquery = String . format ( " /RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s'] " , orchestrator ) ;
String orchestratorProfile = isLookUpService . getResourceProfileByQuery ( xquery ) ;
final Document doc = new SAXReader ( ) . read ( new StringReader ( orchestratorProfile ) ) ;
final String actionSetId = doc . valueOf ( " //DEDUPLICATION/ACTION_SET/@id " ) ;
final List < DedupConfig > configurations = new ArrayList < > ( ) ;
for ( final Object o : doc . selectNodes ( " //SCAN_SEQUENCE/SCAN " ) ) {
configurations . add ( loadConfig ( isLookUpService , actionSetId , o ) ) ;
}
return configurations ;
}
private static DedupConfig loadConfig ( final ISLookUpService isLookUpService , final String actionSetId , final Object o )
throws ISLookUpException {
final Element s = ( Element ) o ;
final String configProfileId = s . attributeValue ( " id " ) ;
final String conf =
isLookUpService . getResourceProfileByQuery ( String . format (
" for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text() " ,
configProfileId ) ) ;
final DedupConfig dedupConfig = DedupConfig . load ( conf ) ;
dedupConfig . getWf ( ) . setConfigurationId ( actionSetId ) ;
return dedupConfig ;
}
2019-12-06 13:38:00 +01:00
}