2020-04-30 11:05:17 +02:00
2020-02-18 17:23:34 +01:00
package eu.dnetlib.dhp ;
2021-10-29 11:20:03 +02:00
import java.util.ArrayList ;
2020-05-07 18:22:26 +02:00
import java.util.List ;
import java.util.Optional ;
2020-04-30 11:05:17 +02:00
2020-04-27 10:40:26 +02:00
import org.apache.spark.api.java.function.MapFunction ;
2020-05-07 18:22:26 +02:00
import org.apache.spark.sql.Dataset ;
2020-04-27 10:40:26 +02:00
import org.apache.spark.sql.Encoders ;
import org.apache.spark.sql.Row ;
import org.apache.spark.sql.SparkSession ;
2020-02-18 17:24:40 +01:00
2020-04-30 11:05:17 +02:00
import com.fasterxml.jackson.databind.ObjectMapper ;
import eu.dnetlib.dhp.application.ArgumentApplicationParser ;
import eu.dnetlib.dhp.common.HdfsSupport ;
2020-05-15 10:21:09 +02:00
import eu.dnetlib.dhp.schema.common.ModelConstants ;
2021-08-11 12:13:22 +02:00
import eu.dnetlib.dhp.schema.oaf.Country ;
import eu.dnetlib.dhp.schema.oaf.DataInfo ;
import eu.dnetlib.dhp.schema.oaf.Qualifier ;
import eu.dnetlib.dhp.schema.oaf.Relation ;
2020-04-30 11:05:17 +02:00
2020-02-18 17:23:34 +01:00
public class PropagationConstant {
2021-08-11 12:13:22 +02:00
/**
 * Hidden constructor: every member visible in this class is static, so no
 * instance is ever needed.
 */
private PropagationConstant() {
    // intentionally empty
}
2021-11-09 12:07:19 +01:00
public static final String DOI = " doi " ;
public static final String REF_DOI = " .refs " ;
2021-11-09 11:25:41 +01:00
public static final String UPDATE_DATA_INFO_TYPE = " update " ;
public static final String UPDATE_SUBJECT_FOS_CLASS_ID = " subject:fos " ;
2021-11-10 17:00:37 +01:00
public static final String UPDATE_CLASS_NAME = " Inferred by OpenAIRE " ;
2021-11-09 11:25:41 +01:00
public static final String UPDATE_MEASURE_BIP_CLASS_ID = " measure:bip " ;
2021-11-10 17:00:37 +01:00
2021-11-09 12:07:19 +01:00
public static final String FOS_CLASS_ID = " FOS " ;
2021-11-10 17:00:37 +01:00
public static final String FOS_CLASS_NAME = " Fields of Science and Technology classification " ;
2021-11-09 11:25:41 +01:00
2021-11-09 12:07:19 +01:00
public static final String OPENCITATIONS_CLASSID = " sysimport:crosswalk:opencitations " ;
public static final String OPENCITATIONS_CLASSNAME = " Imported from OpenCitations " ;
public static final String ID_PREFIX = " 50|doi_________:: " ;
public static final String OC_TRUST = " 0.91 " ;
2021-11-09 11:25:41 +01:00
2021-11-09 12:07:19 +01:00
public final static String NULL = " NULL " ;
2021-11-09 11:25:41 +01:00
2020-04-30 11:05:17 +02:00
public static final String INSTITUTIONAL_REPO_TYPE = " pubsrepository::institutional " ;
public static final String PROPAGATION_DATA_INFO_TYPE = " propagation " ;
public static final String TRUE = " true " ;
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = " country:instrepos " ;
public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = " Propagation of country to result collected from datasources of type institutional repositories " ;
public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID = " result:organization:instrepo " ;
public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = " Propagation of affiliation to result collected from datasources of type institutional repository " ;
2021-10-29 11:20:03 +02:00
public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID = " result:organization:semrel " ;
public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_NAME = " Propagation of affiliation to result through sematic relations " ;
2020-04-30 11:05:17 +02:00
public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = " result:project:semrel " ;
public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = " Propagation of result to project through semantic relation " ;
public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID = " result:community:semrel " ;
public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME = " Propagation of result belonging to community through semantic relation " ;
public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID = " result:community:organization " ;
public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME = " Propagation of result belonging to community through organization " ;
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = " authorpid:result " ;
public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = " Propagation of authors pid to result through semantic relations " ;
2021-10-29 11:20:03 +02:00
public static final String ITERATION_ONE = " ExitAtFirstIteration " ;
public static final String ITERATION_TWO = " ExitAtSecondIteration " ;
public static final String ITERATION_THREE = " ExitAtThirdIteration " ;
public static final String ITERATION_FOUR = " ExitAtFourthIteration " ;
public static final String ITERATION_FIVE = " ExitAtFifthIteration " ;
public static final String ITERATION_NO_PARENT = " ExitAtNoFirstParentReached " ;
2020-04-30 11:05:17 +02:00
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper ( ) ;
2020-05-07 18:22:26 +02:00
private static final String cfHbforResultQuery = " select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
+
" from result r " +
" lateral view explode(instance) i as inst " +
" where r.datainfo.deletedbyinference=false " ;
2020-04-30 11:05:17 +02:00
public static Country getCountry ( String classid , String classname ) {
Country nc = new Country ( ) ;
nc . setClassid ( classid ) ;
nc . setClassname ( classname ) ;
2020-05-14 18:22:50 +02:00
nc . setSchemename ( ModelConstants . DNET_COUNTRY_TYPE ) ;
nc . setSchemeid ( ModelConstants . DNET_COUNTRY_TYPE ) ;
2020-04-30 11:05:17 +02:00
nc
. setDataInfo (
getDataInfo (
PROPAGATION_DATA_INFO_TYPE ,
PROPAGATION_COUNTRY_INSTREPO_CLASS_ID ,
2021-09-15 18:44:54 +02:00
PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME ,
2021-10-12 08:11:53 +02:00
ModelConstants . DNET_PROVENANCE_ACTIONS ) ) ;
2020-04-30 11:05:17 +02:00
return nc ;
}
public static DataInfo getDataInfo (
2021-09-15 18:44:54 +02:00
String inference_provenance , String inference_class_id , String inference_class_name , String qualifierSchema ) {
2021-11-09 11:25:41 +01:00
return getDataInfo ( inference_provenance , inference_class_id , inference_class_name , qualifierSchema , " 0.85 " ) ;
}
public static DataInfo getDataInfo (
2021-11-10 17:00:37 +01:00
String inference_provenance , String inference_class_id , String inference_class_name , String qualifierSchema ,
String trust ) {
return getDataInfo (
inference_provenance , inference_class_id , inference_class_name , qualifierSchema , trust , true ) ;
2021-11-09 12:07:19 +01:00
}
public static DataInfo getDataInfo (
2021-11-10 17:00:37 +01:00
String inference_provenance , String inference_class_id , String inference_class_name , String qualifierSchema ,
String trust , boolean inferred ) {
2020-04-30 11:05:17 +02:00
DataInfo di = new DataInfo ( ) ;
2021-11-09 12:07:19 +01:00
di . setInferred ( inferred ) ;
2020-04-30 11:05:17 +02:00
di . setDeletedbyinference ( false ) ;
2021-11-09 11:25:41 +01:00
di . setTrust ( trust ) ;
2020-04-30 11:05:17 +02:00
di . setInferenceprovenance ( inference_provenance ) ;
2021-09-15 18:44:54 +02:00
di . setProvenanceaction ( getQualifier ( inference_class_id , inference_class_name , qualifierSchema ) ) ;
2020-04-30 11:05:17 +02:00
return di ;
}
2021-10-12 08:11:53 +02:00
public static Qualifier getQualifier ( String inference_class_id , String inference_class_name ,
String qualifierSchema ) {
2020-04-30 11:05:17 +02:00
Qualifier pa = new Qualifier ( ) ;
pa . setClassid ( inference_class_id ) ;
pa . setClassname ( inference_class_name ) ;
2021-09-15 18:44:54 +02:00
pa . setSchemeid ( qualifierSchema ) ;
pa . setSchemename ( qualifierSchema ) ;
2020-04-30 11:05:17 +02:00
return pa ;
}
2021-10-29 11:20:03 +02:00
/**
 * Creates the two mirrored affiliation relations between an organization and
 * a result.
 *
 * @param orgId     identifier of the organization
 * @param resultId  identifier of the result
 * @param classID   provenance class id recorded on both relations
 * @param className provenance class name recorded on both relations
 * @return a list with exactly two relations, in this order: organization to
 *         result ({@code IS_AUTHOR_INSTITUTION_OF}), then result to
 *         organization ({@code HAS_AUTHOR_INSTITUTION}); both use rel type
 *         {@code RESULT_ORGANIZATION}, sub rel type {@code AFFILIATION} and
 *         {@code PROPAGATION_DATA_INFO_TYPE} as inference provenance
 */
public static ArrayList<Relation> getOrganizationRelationPair(String orgId,
    String resultId,
    String classID,
    String className) {
    // diamond instead of the raw ArrayList avoids an unchecked assignment
    ArrayList<Relation> newRelations = new ArrayList<>();
    newRelations
        .add(
            getRelation(
                orgId,
                resultId,
                ModelConstants.IS_AUTHOR_INSTITUTION_OF,
                ModelConstants.RESULT_ORGANIZATION,
                ModelConstants.AFFILIATION,
                PROPAGATION_DATA_INFO_TYPE,
                classID,
                className));
    newRelations
        .add(
            getRelation(
                resultId,
                orgId,
                ModelConstants.HAS_AUTHOR_INSTITUTION,
                ModelConstants.RESULT_ORGANIZATION,
                ModelConstants.AFFILIATION,
                PROPAGATION_DATA_INFO_TYPE,
                classID,
                className));
    return newRelations;
}
2020-04-30 11:05:17 +02:00
public static Relation getRelation (
String source ,
String target ,
String rel_class ,
String rel_type ,
String subrel_type ,
String inference_provenance ,
String inference_class_id ,
String inference_class_name ) {
Relation r = new Relation ( ) ;
r . setSource ( source ) ;
r . setTarget ( target ) ;
r . setRelClass ( rel_class ) ;
r . setRelType ( rel_type ) ;
r . setSubRelType ( subrel_type ) ;
2021-10-12 08:11:53 +02:00
r
. setDataInfo (
getDataInfo (
inference_provenance , inference_class_id , inference_class_name ,
ModelConstants . DNET_PROVENANCE_ACTIONS ) ) ;
2020-04-30 11:05:17 +02:00
return r ;
}
public static String getConstraintList ( String text , List < String > constraints ) {
2020-08-18 16:42:08 +02:00
String ret = " and ( " + text + constraints . get ( 0 ) . toLowerCase ( ) + " ' " ;
2020-04-30 11:05:17 +02:00
for ( int i = 1 ; i < constraints . size ( ) ; i + + ) {
2020-08-18 16:42:08 +02:00
ret + = " OR " + text + constraints . get ( i ) . toLowerCase ( ) + " ' " ;
2020-04-30 11:05:17 +02:00
}
ret + = " ) " ;
return ret ;
}
/**
 * Removes {@code path} through {@code HdfsSupport.remove}, using the Hadoop
 * configuration taken from the Spark context of the given session.
 *
 * @param spark session whose Hadoop configuration is used
 * @param path  location handed to {@code HdfsSupport.remove}
 */
public static void removeOutputDir(SparkSession spark, String path) {
    HdfsSupport
        .remove(
            path,
            spark.sparkContext().hadoopConfiguration());
}
public static Boolean isSparkSessionManaged ( ArgumentApplicationParser parser ) {
return Optional
. ofNullable ( parser . get ( " isSparkSessionManaged " ) )
. map ( Boolean : : valueOf )
. orElse ( Boolean . TRUE ) ;
}
public static Boolean isTest ( ArgumentApplicationParser parser ) {
return Optional
. ofNullable ( parser . get ( " isTest " ) )
. map ( Boolean : : valueOf )
. orElse ( Boolean . FALSE ) ;
}
2020-05-07 18:22:26 +02:00
public static void createCfHbforResult ( SparkSession spark ) {
org . apache . spark . sql . Dataset < Row > cfhb = spark . sql ( cfHbforResultQuery ) ;
2020-04-30 11:05:17 +02:00
cfhb . createOrReplaceTempView ( " cfhb " ) ;
}
2020-05-07 18:22:26 +02:00
public static < R > Dataset < R > readPath (
SparkSession spark , String inputPath , Class < R > clazz ) {
2020-04-30 11:05:17 +02:00
return spark
. read ( )
. textFile ( inputPath )
2020-05-07 18:22:26 +02:00
. map ( ( MapFunction < String , R > ) value - > OBJECT_MAPPER . readValue ( value , clazz ) , Encoders . bean ( clazz ) ) ;
2020-04-30 11:05:17 +02:00
}
2020-02-18 17:23:34 +01:00
}