From 236b64d8305a93cea73630c89b14d3784b923ae7 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Sat, 29 Jun 2024 18:29:20 +0200 Subject: [PATCH 1/4] [AffiliationIngestion]Extended the ingestion of affiliation from open aire to include also links derived from Web Crawl. Extended the test. Inserted in Constatns the id and name of the webcrawl datasource to be used here and also in the ingestion of links from web crawl --- .../eu/dnetlib/dhp/actionmanager/Constants.java | 3 +++ .../PrepareAffiliationRelations.java | 15 ++++++++++++--- .../webcrawl/CreateActionSetFromWebEntries.java | 10 ++++++---- .../input_actionset_parameter.json | 8 +++++++- .../bipaffiliations/oozie_app/workflow.xml | 6 +++++- .../PrepareAffiliationRelationsTest.java | 7 ++++--- .../actionmanager/bipaffiliations/doi_to_ror.json | 4 +++- 7 files changed, 40 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 006d3af76..73b4b77cb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -42,6 +42,9 @@ public class Constants { public static final String NULL = "NULL"; public static final String NA = "N/A"; + public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b"; + public static final String WEB_CRAWL_NAME = "Web Crawl"; + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private Constants() { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index c10eb5c8c..b0b757005 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -41,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable { private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String ID_PREFIX = "50|doi_________::"; - public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference"; - public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!"; - public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref"; + public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference"; + public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE"; + public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation"; public static void main(String[] args) throws Exception { @@ -71,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable { final String dataciteInputPath = parser.get("dataciteInputPath"); log.info("dataciteInputPath: {}", dataciteInputPath); + final String webcrawlInputPath = parser.get("webCrawlInputPath"); + log.info("webcrawlInputPath: {}", webcrawlInputPath); + final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); @@ -102,10 +105,16 @@ public class PrepareAffiliationRelations implements Serializable { JavaPairRDD dataciteRelations = prepareAffiliationRelations( spark, dataciteInputPath, collectedFromDatacite); + List collectedFromWebCrawl = OafMapperUtils + .listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME); + JavaPairRDD webCrawlRelations = prepareAffiliationRelations( + spark, webcrawlInputPath, collectedFromWebCrawl); + crossrefRelations .union(pubmedRelations) .union(openAPCRelations) .union(dataciteRelations) + .union(webCrawlRelations) .saveAsHadoopFile( outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java index 27970f2c3..f4b0cbc6f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java @@ -1,12 +1,15 @@ package eu.dnetlib.dhp.actionmanager.webcrawl; + import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.actionmanager.Constants; +import io.netty.util.Constant; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; @@ -44,8 +47,7 @@ public class CreateActionSetFromWebEntries implements Serializable { private static final String PMID_PREFIX = "50|pmid________::"; private static final String PMCID_PREFIX = "50|pmc_________::"; - private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b"; - private static final String WEB_CRAWL_NAME = "Web Crawl"; + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(String[] args) throws Exception { @@ -214,7 +216,7 @@ public class CreateActionSetFromWebEntries implements Serializable { ModelConstants.IS_AUTHOR_INSTITUTION_OF, Arrays .asList( - OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)), + OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)), OafMapperUtils .dataInfo( false, null, false, false, @@ -233,7 +235,7 @@ public class CreateActionSetFromWebEntries implements Serializable { ModelConstants.HAS_AUTHOR_INSTITUTION, Arrays .asList( - OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)), + OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)), OafMapperUtils .dataInfo( false, null, false, false, diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json index 9671129f7..4d85cf26b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json @@ -28,7 +28,13 @@ "paramLongName": "dataciteInputPath", "paramDescription": "the path to get the input data from Datacite", "paramRequired": true - }, + },{ + "paramName": "wip", + "paramLongName": "webCrawlInputPath", + "paramDescription": "the path to get the input data from Web Crawl", + "paramRequired": true +} +, { "paramName": "o", "paramLongName": "outputPath", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml index e8e6a7c33..2e89c07fd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml @@ -17,6 +17,10 @@ dataciteInputPath the path where to find the inferred affiliation relations from Datacite + + webCrawlInputPath + the path where to find the inferred affiliation relations from webCrawl + outputPath the path where to store the actionset @@ -112,7 +116,7 @@ --pubmedInputPath${pubmedInputPath} --openapcInputPath${openapcInputPath} --dataciteInputPath${dataciteInputPath} - + --webCrawlInputPath${webCrawlInputPath} --outputPath${outputPath} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java index bceb9d754..bb0188e43 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java @@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest { "-pubmedInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPath, "-dataciteInputPath", crossrefAffiliationRelationPath, + "-webCrawlInputPath", crossrefAffiliationRelationPath, "-outputPath", outputPath }); @@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest { // ); // } // count the number of relations - assertEquals(80, tmp.count()); + assertEquals(120, tmp.count()); Dataset dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); dataset.createOrReplaceTempView("result"); @@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest { // verify that we have equal number of bi-directional relations Assertions .assertEquals( - 40, execVerification + 60, execVerification .filter( "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") .collectAsList() @@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest { Assertions .assertEquals( - 40, execVerification + 60, execVerification .filter( "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") .collectAsList() diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json index 985a8d14b..08dc3f7eb 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json @@ -4,4 +4,6 @@ {"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]} -{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]} \ No newline at end of file +{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]} +{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]} +{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]} \ No newline at end of file From 9cbe966b4adcc497798af69f1a01c47aab3b5926 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Sat, 29 Jun 2024 18:35:49 +0200 Subject: [PATCH 2/4] [AffiliationIngestion]refactoring --- .../bipaffiliations/PrepareAffiliationRelations.java | 6 +++--- .../webcrawl/CreateActionSetFromWebEntries.java | 5 ++--- .../bipaffiliations/PrepareAffiliationRelationsTest.java | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index b0b757005..8f911e980 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -106,15 +106,15 @@ public class PrepareAffiliationRelations implements Serializable { spark, dataciteInputPath, collectedFromDatacite); List collectedFromWebCrawl = OafMapperUtils - .listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME); + .listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME); JavaPairRDD webCrawlRelations = prepareAffiliationRelations( - spark, webcrawlInputPath, collectedFromWebCrawl); + spark, webcrawlInputPath, collectedFromWebCrawl); crossrefRelations .union(pubmedRelations) .union(openAPCRelations) .union(dataciteRelations) - .union(webCrawlRelations) + .union(webCrawlRelations) .saveAsHadoopFile( outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java index f4b0cbc6f..533b90cd8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java @@ -1,15 +1,12 @@ package eu.dnetlib.dhp.actionmanager.webcrawl; - import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.actionmanager.Constants; -import io.netty.util.Constant; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; @@ -24,6 +21,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.actionmanager.Constants; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -32,6 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner; import eu.dnetlib.dhp.schema.oaf.utils.PidType; +import io.netty.util.Constant; import scala.Tuple2; /** diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java index bb0188e43..bffe41ac7 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java @@ -88,7 +88,7 @@ public class PrepareAffiliationRelationsTest { "-pubmedInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPath, "-dataciteInputPath", crossrefAffiliationRelationPath, - "-webCrawlInputPath", crossrefAffiliationRelationPath, + "-webCrawlInputPath", crossrefAffiliationRelationPath, "-outputPath", outputPath }); From a2b708bb71b1f8a81f3d2ef369eb80638ade7808 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Sat, 29 Jun 2024 18:36:47 +0200 Subject: [PATCH 3/4] [AffiliationIngestion]refactoring --- .../model/ProvisionModelSupport.java | 20 ++++--- .../oa/provision/SolrConfigExploreTest.java | 56 ++++++++++--------- .../dhp/oa/provision/SolrConfigTest.java | 3 +- 3 files changed, 44 insertions(+), 35 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index b450b95be..abcf4992f 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -5,7 +5,6 @@ import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.solr.ExternalReference; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -31,6 +30,7 @@ import eu.dnetlib.dhp.schema.solr.Context; import eu.dnetlib.dhp.schema.solr.Country; import eu.dnetlib.dhp.schema.solr.Datasource; import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines; +import eu.dnetlib.dhp.schema.solr.ExternalReference; import eu.dnetlib.dhp.schema.solr.Instance; import eu.dnetlib.dhp.schema.solr.Journal; import eu.dnetlib.dhp.schema.solr.Measure; @@ -562,10 +562,16 @@ public class ProvisionModelSupport { .orElse(null); } - private static List mapExternalReference(List externalReference) { - return Optional.ofNullable(externalReference) - .map(ext -> ext.stream() - .map(e -> ExternalReference.newInstance( + private static List mapExternalReference( + List externalReference) { + return Optional + .ofNullable(externalReference) + .map( + ext -> ext + .stream() + .map( + e -> ExternalReference + .newInstance( e.getSitename(), e.getLabel(), e.getAlternateLabel(), @@ -573,8 +579,8 @@ public class ProvisionModelSupport { mapCodeLabel(e.getQualifier()), e.getRefidentifier(), e.getQuery())) - .collect(Collectors.toList())) - .orElse(Lists.newArrayList()); + .collect(Collectors.toList())) + .orElse(Lists.newArrayList()); } private static List asContext(List ctxList, diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java index 41eac2a30..90aef5adc 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java @@ -1,12 +1,13 @@ package eu.dnetlib.dhp.oa.provision; +import static org.junit.jupiter.api.Assertions.assertEquals; + import java.io.File; import java.io.IOException; import java.net.URI; import java.nio.file.Path; -import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.solr.client.solrj.SolrQuery; @@ -32,14 +33,13 @@ import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; - -import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.junit.jupiter.api.Assertions.assertEquals; +import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; +import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) public class SolrConfigExploreTest { @@ -91,7 +91,7 @@ public class SolrConfigExploreTest { SparkConf conf = new SparkConf(); conf.setAppName(XmlIndexingJobTest.class.getSimpleName()); conf.registerKryoClasses(new Class[] { - SerializableSolrInputDocument.class + SerializableSolrInputDocument.class }); conf.setMaster("local[1]"); @@ -101,10 +101,10 @@ public class SolrConfigExploreTest { conf.set("spark.sql.warehouse.dir", workingDir.resolve("spark").toString()); spark = SparkSession - .builder() - .appName(SolrConfigExploreTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); + .builder() + .appName(SolrConfigExploreTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); // random unassigned HTTP port final int jettyPort = 0; @@ -134,35 +134,35 @@ public class SolrConfigExploreTest { log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString()); log - .info( - CollectionAdminRequest.ClusterStatus - .getClusterStatus() - .process(miniCluster.getSolrClient()) - .toString()); + .info( + CollectionAdminRequest.ClusterStatus + .getClusterStatus() + .process(miniCluster.getSolrClient()) + .toString()); NamedList res = createCollection( - miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME); + miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME); res.forEach(o -> log.info(o.toString())); // miniCluster.getSolrClient().setDefaultCollection(SHADOW_COLLECTION); res = createCollection( - miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME); + miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME); res.forEach(o -> log.info(o.toString())); admin = new SolrAdminApplication(miniCluster.getZkClient().getZkServerAddress()); CollectionAdminResponse rsp = (CollectionAdminResponse) admin - .createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION); + .createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION); assertEquals(0, rsp.getStatus()); rsp = (CollectionAdminResponse) admin.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, SHADOW_COLLECTION); assertEquals(0, rsp.getStatus()); log - .info( - CollectionAdminRequest.ClusterStatus - .getClusterStatus() - .process(miniCluster.getSolrClient()) - .toString()); + .info( + CollectionAdminRequest.ClusterStatus + .getClusterStatus() + .process(miniCluster.getSolrClient()) + .toString()); } @@ -180,7 +180,8 @@ public class SolrConfigExploreTest { new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize) .run(isLookupClient); - Assertions.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); + Assertions + .assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); String[] queryStrings = { "cancer", @@ -200,7 +201,8 @@ public class SolrConfigExploreTest { // System.out.println(rsp.getExplainMap()); for (SolrDocument doc : rsp.getResults()) { - log.info( + log + .info( doc.get("score") + "\t" + doc.get("__indexrecordidentifier") + "\t" + doc.get("resultidentifier") + "\t" + @@ -216,7 +218,7 @@ public class SolrConfigExploreTest { } protected static NamedList createCollection(CloudSolrClient client, String name, int numShards, - int replicationFactor, int maxShardsPerNode, String configName) throws Exception { + int replicationFactor, int maxShardsPerNode, String configName) throws Exception { ModifiableSolrParams modParams = new ModifiableSolrParams(); modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name()); modParams.set("name", name); diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java index 2c62389c6..c04fa1cc6 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java @@ -85,7 +85,8 @@ public class SolrConfigTest extends SolrTest { new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize) .run(isLookupClient); - Assertions.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); + Assertions + .assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); String[] queryStrings = { "cancer", From f17e1243ba4974942d2ce5062d85b90305320b70 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 17 Jul 2024 10:23:50 +0200 Subject: [PATCH 4/4] reverted changed contens under dhp-graph-provision --- .../oa/provision/SolrAdminApplication.java | 2 + .../model/ProvisionModelSupport.java | 20 +++---- .../oa/provision/SolrConfigExploreTest.java | 54 +++++++++---------- .../dhp/oa/provision/SolrConfigTest.java | 3 +- 4 files changed, 36 insertions(+), 43 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java index 44426e8c5..2bf7d3fbb 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java @@ -128,12 +128,14 @@ public class SolrAdminApplication implements Closeable { public SolrResponse deleteAlias(String aliasName) throws SolrServerException, IOException { CollectionAdminRequest.DeleteAlias deleteAliasRequest = CollectionAdminRequest.deleteAlias(aliasName); + log.info("deleting alias: {}", aliasName); return deleteAliasRequest.process(solrClient); } public SolrResponse createAlias(String aliasName, String collection) throws IOException, SolrServerException { CollectionAdminRequest.CreateAlias createAliasRequest = CollectionAdminRequest .createAlias(aliasName, collection); + log.info("creating alias: {} for collection: {}", aliasName, collection); return createAliasRequest.process(solrClient); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index abcf4992f..b450b95be 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -5,6 +5,7 @@ import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.solr.ExternalReference; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -30,7 +31,6 @@ import eu.dnetlib.dhp.schema.solr.Context; import eu.dnetlib.dhp.schema.solr.Country; import eu.dnetlib.dhp.schema.solr.Datasource; import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines; -import eu.dnetlib.dhp.schema.solr.ExternalReference; import eu.dnetlib.dhp.schema.solr.Instance; import eu.dnetlib.dhp.schema.solr.Journal; import eu.dnetlib.dhp.schema.solr.Measure; @@ -562,16 +562,10 @@ public class ProvisionModelSupport { .orElse(null); } - private static List mapExternalReference( - List externalReference) { - return Optional - .ofNullable(externalReference) - .map( - ext -> ext - .stream() - .map( - e -> ExternalReference - .newInstance( + private static List mapExternalReference(List externalReference) { + return Optional.ofNullable(externalReference) + .map(ext -> ext.stream() + .map(e -> ExternalReference.newInstance( e.getSitename(), e.getLabel(), e.getAlternateLabel(), @@ -579,8 +573,8 @@ public class ProvisionModelSupport { mapCodeLabel(e.getQualifier()), e.getRefidentifier(), e.getQuery())) - .collect(Collectors.toList())) - .orElse(Lists.newArrayList()); + .collect(Collectors.toList())) + .orElse(Lists.newArrayList()); } private static List asContext(List ctxList, diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java index 90aef5adc..41eac2a30 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java @@ -1,13 +1,12 @@ package eu.dnetlib.dhp.oa.provision; -import static org.junit.jupiter.api.Assertions.assertEquals; - import java.io.File; import java.io.IOException; import java.net.URI; import java.nio.file.Path; +import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.solr.client.solrj.SolrQuery; @@ -33,13 +32,14 @@ import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static org.junit.jupiter.api.Assertions.assertEquals; @ExtendWith(MockitoExtension.class) public class SolrConfigExploreTest { @@ -91,7 +91,7 @@ public class SolrConfigExploreTest { SparkConf conf = new SparkConf(); conf.setAppName(XmlIndexingJobTest.class.getSimpleName()); conf.registerKryoClasses(new Class[] { - SerializableSolrInputDocument.class + SerializableSolrInputDocument.class }); conf.setMaster("local[1]"); @@ -101,10 +101,10 @@ public class SolrConfigExploreTest { conf.set("spark.sql.warehouse.dir", workingDir.resolve("spark").toString()); spark = SparkSession - .builder() - .appName(SolrConfigExploreTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); + .builder() + .appName(SolrConfigExploreTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); // random unassigned HTTP port final int jettyPort = 0; @@ -134,35 +134,35 @@ public class SolrConfigExploreTest { log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString()); log - .info( - CollectionAdminRequest.ClusterStatus - .getClusterStatus() - .process(miniCluster.getSolrClient()) - .toString()); + .info( + CollectionAdminRequest.ClusterStatus + .getClusterStatus() + .process(miniCluster.getSolrClient()) + .toString()); NamedList res = createCollection( - miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME); + miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME); res.forEach(o -> log.info(o.toString())); // miniCluster.getSolrClient().setDefaultCollection(SHADOW_COLLECTION); res = createCollection( - miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME); + miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME); res.forEach(o -> log.info(o.toString())); admin = new SolrAdminApplication(miniCluster.getZkClient().getZkServerAddress()); CollectionAdminResponse rsp = (CollectionAdminResponse) admin - .createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION); + .createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION); assertEquals(0, rsp.getStatus()); rsp = (CollectionAdminResponse) admin.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, SHADOW_COLLECTION); assertEquals(0, rsp.getStatus()); log - .info( - CollectionAdminRequest.ClusterStatus - .getClusterStatus() - .process(miniCluster.getSolrClient()) - .toString()); + .info( + CollectionAdminRequest.ClusterStatus + .getClusterStatus() + .process(miniCluster.getSolrClient()) + .toString()); } @@ -180,8 +180,7 @@ public class SolrConfigExploreTest { new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize) .run(isLookupClient); - Assertions - .assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); + Assertions.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); String[] queryStrings = { "cancer", @@ -201,8 +200,7 @@ public class SolrConfigExploreTest { // System.out.println(rsp.getExplainMap()); for (SolrDocument doc : rsp.getResults()) { - log - .info( + log.info( doc.get("score") + "\t" + doc.get("__indexrecordidentifier") + "\t" + doc.get("resultidentifier") + "\t" + @@ -218,7 +216,7 @@ public class SolrConfigExploreTest { } protected static NamedList createCollection(CloudSolrClient client, String name, int numShards, - int replicationFactor, int maxShardsPerNode, String configName) throws Exception { + int replicationFactor, int maxShardsPerNode, String configName) throws Exception { ModifiableSolrParams modParams = new ModifiableSolrParams(); modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name()); modParams.set("name", name); diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java index c04fa1cc6..2c62389c6 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java @@ -85,8 +85,7 @@ public class SolrConfigTest extends SolrTest { new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize) .run(isLookupClient); - Assertions - .assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); + Assertions.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); String[] queryStrings = { "cancer",