package eu.dnetlib.dedup;

import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;

import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;

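/**
 * Utility methods shared by the deduplication Spark jobs: accumulator setup,
 * HDFS helpers, configuration loading, and author-merge heuristics.
 */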
public class DedupUtility {

    private static final Double THRESHOLD = 0.95;
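
    /**
     * Creates the Spark accumulators used to track per-entity dedup statistics:
     * hash-key cardinality, missing order fields, oversized groups, skip-list
     * hits, similarity comparisons and sub-threshold distances.
     */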
    public static Map<String, LongAccumulator> constructAccumulator(final DedupConfig dedupConf, final SparkContext context) {

        Map<String, LongAccumulator> accumulators = new HashMap<>();

        String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
        accumulators.put(acc1, context.longAccumulator(acc1));
        String acc2 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
        accumulators.put(acc2, context.longAccumulator(acc2));
        String acc3 = String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
        accumulators.put(acc3, context.longAccumulator(acc3));
        String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
        accumulators.put(acc4, context.longAccumulator(acc4));
        String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
        accumulators.put(acc5, context.longAccumulator(acc5));
        String acc6 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
        accumulators.put(acc6, context.longAccumulator(acc6));

        return accumulators;
    }
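
    /** Reads a text file from HDFS as an RDD of lines. */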
    public static JavaRDD<String> loadDataFromHDFS(String path, JavaSparkContext context) {
        return context.textFile(path);
    }
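
    /** Recursively deletes the given HDFS path, if it exists. */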
    public static void deleteIfExists(String path) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(new Path(path))) {
            fileSystem.delete(new Path(path), true);
        }
    }
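
    /** Loads and parses a {@link DedupConfig} from a JSON file on HDFS. */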
    public static DedupConfig loadConfigFromHDFS(String path) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(conf);
        FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path)));
        return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name()));
    }
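
    /** Reads a classpath resource into a String, resolving it relative to the given class. */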
    static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
        final StringWriter sw = new StringWriter();
        try {
            IOUtils.copy(clazz.getResourceAsStream(filename), sw);
            return sw.toString();
        } catch (final IOException e) {
            // propagate the original exception as the cause, so the stack trace is not lost
            throw new RuntimeException("cannot load resource from classpath: " + filename, e);
        }
    }
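
    /** Computes the clustering (blocking) keys for a document, honouring the configured blacklist. */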
    static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
        return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
    }
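
    /** Returns the hex-encoded MD5 digest of the given string, or null if hashing fails. */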
    public static String md5(final String s) {
        try {
            final MessageDigest md = MessageDigest.getInstance("MD5");
            md.update(s.getBytes(StandardCharsets.UTF_8));
            return new String(Hex.encodeHex(md.digest()));
        } catch (final Exception e) {
            System.err.println("Error creating id");
            return null;
        }
    }
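
    /**
     * Merges two author lists: the list with more pid-bearing authors (or, on a
     * tie, the longer list) is taken as the base, and pids found only in the
     * other list are attached to the most similar base author.
     */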
    public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
        int pa = countAuthorsPids(a);
        int pb = countAuthorsPids(b);
        List<Author> base, enrich;
        int sa = authorsSize(a);
        int sb = authorsSize(b);

        if (pa == pb) {
            base = sa > sb ? a : b;
            enrich = sa > sb ? b : a;
        } else {
            base = pa > pb ? a : b;
            enrich = pa > pb ? b : a;
        }
        enrichPidFromList(base, enrich);
        return base;
}
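
    /**
     * Copies to the base list every pid that appears only in the enrich list,
     * attaching it to the base author with the highest name similarity, provided
     * the similarity exceeds {@link #THRESHOLD}.
     */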
    private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
        if (base == null || enrich == null)
            return;
        final Map<String, Author> basePidAuthorMap = base.stream()
                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
                .flatMap(a -> a.getPid()
                        .stream()
                        .map(p -> new Tuple2<>(p.toComparableString(), a))
                )
                // keep the first author seen for each pid, so duplicate keys do not make toMap throw
                .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (a1, a2) -> a1));

        final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
                .stream()
                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
                .flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a)))
                .collect(Collectors.toList());

        pidToEnrich.forEach(a -> {
            Optional<Tuple2<Double, Author>> simAuthor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1));
            if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
                Author r = simAuthor.get()._2();
                r.getPid().add(a._1());
            }
        });
    }
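
    /** Builders for the conventional HDFS layout: basePath/entityType, plus the _simRel and _mergeRel relation paths. */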
    public static String createEntityPath(final String basePath, final String entityType) {
        return String.format("%s/%s", basePath, entityType);
    }

    public static String createSimRelPath(final String basePath, final String entityType) {
        return String.format("%s/%s_simRel", basePath, entityType);
    }

    public static String createMergeRelPath(final String basePath, final String entityType) {
        return String.format("%s/%s_mergeRel", basePath, entityType);
    }
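
    /**
     * Jaro-Winkler similarity between two authors: compares normalized surnames
     * when both names parse accurately, otherwise the normalized full names.
     */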
    private static Double sim(Author a, Author b) {
        final Person pa = parse(a);
        final Person pb = parse(b);

        if (pa.isAccurate() && pb.isAccurate()) {
            return new JaroWinkler().score(
                    normalize(pa.getSurnameString()),
                    normalize(pb.getSurnameString()));
        } else {
            return new JaroWinkler().score(
                    normalize(pa.getNormalisedFullname()),
                    normalize(pb.getNormalisedFullname()));
        }
    }
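
    /**
     * Lower-cases the input and, after NFD decomposition, replaces non-word
     * characters, combining diacritical marks, punctuation, digits and newlines
     * with spaces, then trims the result.
     */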
    private static String normalize(final String s) {
        return nfd(s).toLowerCase()
                // do not compact the regexes into a single expression; that would cause a StackOverflowError on large input strings
                .replaceAll("(\\W)+", " ")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .trim();
    }

    private static String nfd(final String s) {
        return Normalizer.normalize(s, Normalizer.Form.NFD);
    }
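
    /** Builds a {@link Person} from surname and name when the surname is available, otherwise from the full name. */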
    private static Person parse(Author author) {
        if (StringUtils.isNotBlank(author.getSurname())) {
            return new Person(author.getSurname() + ", " + author.getName(), false);
        } else {
            return new Person(author.getFullname(), false);
        }
    }
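
    /** Counts the authors exposing at least one non-blank pid. */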
    private static int countAuthorsPids(List<Author> authors) {
        if (authors == null)
            return 0;
        return (int) authors.stream().map(DedupUtility::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count();
    }

    private static int authorsSize(List<Author> authors) {
        if (authors == null)
            return 0;
        return authors.size();
    }

    private static boolean isAccurate(final Author a) {
        return StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname());
    }
    private static String extractAuthorPid(Author a) {
        if (a == null || a.getPid() == null || a.getPid().size() == 0)
            return null;

        StringBuilder mainPid = new StringBuilder();
        a.getPid().forEach(pid -> {
            if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) {
                mainPid.setLength(0);
                mainPid.append(pid.getValue());
            } else {
                if (mainPid.length() == 0)
                    mainPid.append(pid.getValue());
            }
        });
        return mainPid.toString();
    }
}