From 4a894657406d37a608bf82c1fe8fd18da87b7a5c Mon Sep 17 00:00:00 2001 From: sandro Date: Wed, 29 Apr 2020 13:24:29 +0200 Subject: [PATCH] reformatted code --- .../GenerateOoziePropertiesMojo.java | 101 +- .../WritePredefinedProjectProperties.java | 731 ++--- .../GenerateOoziePropertiesMojoTest.java | 131 +- .../WritePredefinedProjectPropertiesTest.java | 576 ++-- .../collector/worker/model/ApiDescriptor.java | 57 +- .../mdstore/manager/common/model/MDStore.java | 162 +- .../common/model/MDStoreCurrentVersion.java | 58 +- .../manager/common/model/MDStoreVersion.java | 126 +- .../manager/common/model/MDStoreWithInfo.java | 186 +- .../ArgumentApplicationParser.java | 136 +- .../dhp/application/OptionsParameter.java | 50 +- .../common/FunctionalInterfaceSupport.java | 79 +- .../eu/dnetlib/dhp/common/HdfsSupport.java | 103 +- .../dhp/common/SparkSessionSupport.java | 121 +- .../dnetlib/dhp/common/ThrowingSupport.java | 124 +- .../dhp/model/mdstore/MetadataRecord.java | 175 +- .../dnetlib/dhp/model/mdstore/Provenance.java | 61 +- .../dhp/parser/utility/VtdException.java | 13 +- .../dhp/parser/utility/VtdUtilityParser.java | 169 +- .../java/eu/dnetlib/dhp/utils/DHPUtils.java | 107 +- .../dhp/utils/ISLookupClientFactory.java | 28 +- .../saxon/AbstractExtensionFunction.java | 34 +- .../dnetlib/dhp/utils/saxon/ExtractYear.java | 94 +- .../dhp/utils/saxon/NormalizeDate.java | 92 +- .../eu/dnetlib/dhp/utils/saxon/PickFirst.java | 82 +- .../utils/saxon/SaxonTransformerFactory.java | 33 +- .../main/java/eu/dnetlib/message/Message.java | 103 +- .../eu/dnetlib/message/MessageConsumer.java | 68 +- .../eu/dnetlib/message/MessageManager.java | 208 +- .../java/eu/dnetlib/message/MessageType.java | 4 +- .../scholexplorer/relation/RelInfo.java | 29 +- .../relation/RelationMapper.java | 16 +- .../ArgumentApplicationParserTest.java | 110 +- .../dnetlib/dhp/common/HdfsSupportTest.java | 99 +- .../dhp/common/SparkSessionSupportTest.java | 75 +- .../dhp/model/mdstore/MetadataRecordTest.java 
| 11 +- .../java/eu/dnetlib/message/MessageTest.java | 67 +- .../relation/RelationMapperTest.java | 11 +- .../dhp/schema/action/AtomicAction.java | 46 +- .../action/AtomicActionDeserializer.java | 33 +- .../dnetlib/dhp/schema/common/EntityType.java | 29 +- .../dhp/schema/common/MainEntityType.java | 6 +- .../dhp/schema/common/ModelConstants.java | 55 +- .../dhp/schema/common/ModelSupport.java | 349 +-- .../eu/dnetlib/dhp/schema/oaf/Author.java | 119 +- .../eu/dnetlib/dhp/schema/oaf/Context.java | 56 +- .../eu/dnetlib/dhp/schema/oaf/Country.java | 42 +- .../eu/dnetlib/dhp/schema/oaf/DataInfo.java | 122 +- .../eu/dnetlib/dhp/schema/oaf/Dataset.java | 151 +- .../eu/dnetlib/dhp/schema/oaf/Datasource.java | 786 +++--- .../dhp/schema/oaf/ExternalReference.java | 170 +- .../eu/dnetlib/dhp/schema/oaf/ExtraInfo.java | 105 +- .../java/eu/dnetlib/dhp/schema/oaf/Field.java | 56 +- .../dnetlib/dhp/schema/oaf/GeoLocation.java | 101 +- .../eu/dnetlib/dhp/schema/oaf/Instance.java | 209 +- .../eu/dnetlib/dhp/schema/oaf/Journal.java | 240 +- .../eu/dnetlib/dhp/schema/oaf/KeyValue.java | 97 +- .../dnetlib/dhp/schema/oaf/OAIProvenance.java | 39 +- .../java/eu/dnetlib/dhp/schema/oaf/Oaf.java | 91 +- .../eu/dnetlib/dhp/schema/oaf/OafEntity.java | 168 +- .../dnetlib/dhp/schema/oaf/Organization.java | 323 ++- .../dhp/schema/oaf/OriginDescription.java | 119 +- .../dhp/schema/oaf/OtherResearchProduct.java | 76 +- .../eu/dnetlib/dhp/schema/oaf/Project.java | 534 ++-- .../dnetlib/dhp/schema/oaf/Publication.java | 47 +- .../eu/dnetlib/dhp/schema/oaf/Qualifier.java | 123 +- .../eu/dnetlib/dhp/schema/oaf/Relation.java | 142 +- .../eu/dnetlib/dhp/schema/oaf/Result.java | 400 +-- .../eu/dnetlib/dhp/schema/oaf/Software.java | 102 +- .../dhp/schema/oaf/StructuredProperty.java | 76 +- .../dhp/schema/scholexplorer/DLIDataset.java | 124 +- .../schema/scholexplorer/DLIPublication.java | 126 +- .../dhp/schema/scholexplorer/DLIRelation.java | 15 +- .../dhp/schema/scholexplorer/DLIUnknown.java | 156 +- 
.../schema/scholexplorer/ProvenaceInfo.java | 57 +- .../dhp/schema/action/AtomicActionTest.java | 40 +- .../dhp/schema/common/ModelSupportTest.java | 42 +- .../eu/dnetlib/dhp/schema/oaf/MergeTest.java | 108 +- .../dhp/schema/scholexplorer/DLItest.java | 119 +- .../dnetlib/dhp/actionmanager/ISClient.java | 219 +- .../migration/LicenseComparator.java | 78 +- .../migration/MigrateActionSet.java | 266 +- .../migration/ProtoConverter.java | 1113 ++++---- .../migration/TransformActions.java | 244 +- .../PartitionActionSetsByPayloadTypeJob.java | 188 +- .../actionmanager/promote/MergeAndGet.java | 135 +- .../PromoteActionPayloadForGraphTableJob.java | 349 ++- .../PromoteActionPayloadFunctions.java | 299 +- ...rtitionActionSetsByPayloadTypeJobTest.java | 335 +-- .../promote/MergeAndGetTest.java | 387 ++- ...moteActionPayloadForGraphTableJobTest.java | 450 +-- .../PromoteActionPayloadFunctionsTest.java | 547 ++-- .../GenerateNativeStoreSparkJob.java | 234 +- .../collection/plugin/CollectorPlugin.java | 6 +- .../plugin/oai/OaiCollectorPlugin.java | 114 +- .../collection/plugin/oai/OaiIterator.java | 279 +- .../plugin/oai/OaiIteratorFactory.java | 31 +- .../worker/DnetCollectorException.java | 43 +- .../worker/DnetCollectorWorker.java | 211 +- .../DnetCollectorWorkerApplication.java | 63 +- .../utils/CollectorPluginErrorLogList.java | 21 +- .../worker/utils/CollectorPluginFactory.java | 20 +- .../worker/utils/HttpConnector.java | 379 +-- .../collection/worker/utils/XmlCleaner.java | 715 +++-- .../dhp/transformation/TransformFunction.java | 119 +- .../transformation/TransformSparkJobNode.java | 151 +- .../dhp/transformation/functions/Cleaner.java | 68 +- .../dhp/transformation/vocabulary/Term.java | 71 +- .../transformation/vocabulary/Vocabulary.java | 71 +- .../vocabulary/VocabularyHelper.java | 21 +- .../dhp/collection/CollectionJobTest.java | 187 +- .../DnetCollectorWorkerApplicationTests.java | 132 +- .../transformation/TransformationJobTest.java | 223 +- 
.../vocabulary/VocabularyTest.java | 11 +- .../dhp/oa/dedup/AbstractSparkAction.java | 116 +- .../eu/dnetlib/dhp/oa/dedup/DatePicker.java | 174 +- .../dhp/oa/dedup/DedupRecordFactory.java | 153 +- .../eu/dnetlib/dhp/oa/dedup/DedupUtility.java | 389 +-- .../java/eu/dnetlib/dhp/oa/dedup/Deduper.java | 81 +- .../dhp/oa/dedup/SparkCreateDedupRecord.java | 147 +- .../dhp/oa/dedup/SparkCreateMergeRels.java | 230 +- .../dhp/oa/dedup/SparkCreateSimRels.java | 190 +- .../dhp/oa/dedup/SparkPropagateRelation.java | 285 +- .../dnetlib/dhp/oa/dedup/SparkReporter.java | 57 +- .../dhp/oa/dedup/SparkUpdateEntity.java | 203 +- .../oa/dedup/graph/ConnectedComponent.java | 120 +- .../eu/dnetlib/dhp/oa/dedup/model/Block.java | 101 +- .../dnetlib/dhp/oa/dedup/MergeAuthorTest.java | 67 +- .../dnetlib/dhp/oa/dedup/SparkDedupTest.java | 727 ++--- .../dhp/oa/dedup/jpath/JsonPathTest.java | 559 ++-- .../java/eu/dnetlib/dedup/DatePicker.java | 174 +- .../eu/dnetlib/dedup/DedupRecordFactory.java | 502 ++-- .../java/eu/dnetlib/dedup/DedupUtility.java | 361 +-- .../main/java/eu/dnetlib/dedup/Deduper.java | 298 +- .../java/eu/dnetlib/dedup/OafEntityType.java | 9 +- .../dedup/SparkCreateConnectedComponent.java | 164 +- .../dnetlib/dedup/SparkCreateDedupRecord.java | 78 +- .../eu/dnetlib/dedup/SparkCreateSimRels.java | 117 +- .../java/eu/dnetlib/dedup/SparkReporter.java | 59 +- .../dedup/graph/ConnectedComponent.java | 120 +- .../dedup/sx/SparkPropagateRelationsJob.java | 176 +- .../dedup/sx/SparkUpdateEntityJob.java | 157 +- .../doiboost/crossref/CrossrefImporter.java | 148 +- .../dnetlib/doiboost/crossref/ESClient.java | 166 +- .../orcid/ActivitiesDecompressor.java | 221 +- .../orcid/OrcidAuthorsDOIsDataGen.java | 67 +- .../doiboost/orcid/OrcidDSManager.java | 121 +- .../doiboost/orcid/SummariesDecompressor.java | 248 +- .../doiboost/orcid/json/JsonWriter.java | 34 +- .../doiboost/orcid/model/AuthorData.java | 71 +- .../doiboost/orcid/model/WorkData.java | 57 +- 
.../doiboost/orcid/xml/XMLRecordParser.java | 188 +- .../eu/dnetlib/doiboost/DoiBoostTest.java | 30 +- .../orcid/xml/XMLRecordParserTest.java | 73 +- .../oa/graph/hive/GraphHiveImporterJob.java | 98 +- .../raw/AbstractMdRecordToOafMapper.java | 869 +++--- .../raw/DispatchEntitiesApplication.java | 114 +- .../raw/GenerateEntitiesApplication.java | 294 +- .../oa/graph/raw/MergeClaimsApplication.java | 186 +- .../raw/MigrateDbEntitiesApplication.java | 1017 +++---- .../raw/MigrateMongoMdstoresApplication.java | 95 +- .../dhp/oa/graph/raw/OafToOafMapper.java | 427 +-- .../dhp/oa/graph/raw/OdfToOafMapper.java | 591 ++-- .../common/AbstractMigrationApplication.java | 109 +- .../dhp/oa/graph/raw/common/DbClient.java | 81 +- .../oa/graph/raw/common/MdstoreClient.java | 148 +- .../graph/raw/common/MigrationConstants.java | 40 +- .../oa/graph/raw/common/OafMapperUtils.java | 369 +-- .../dhp/oa/graph/raw/common/PacePerson.java | 287 +- .../dhp/sx/graph/ImportDataFromMongo.java | 240 +- .../dhp/sx/graph/SparkExtractEntitiesJob.java | 193 +- .../sx/graph/SparkSXGeneratePidSimlarity.java | 112 +- .../SparkScholexplorerCreateRawGraphJob.java | 437 ++- .../SparkScholexplorerGraphImporter.java | 100 +- .../parser/AbstractScholexplorerParser.java | 374 +-- .../parser/DatasetScholexplorerParser.java | 474 ++-- .../PublicationScholexplorerParser.java | 410 +-- .../oa/graph/GraphHiveImporterJobTest.java | 122 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 208 +- .../raw/MigrateDbEntitiesApplicationTest.java | 540 ++-- .../dhp/sx/graph/ScholexplorerParserTest.java | 43 +- .../SparkScholexplorerGraphImporterTest.java | 4 +- ...parkScholexplorerMergeEntitiesJobTest.java | 4 +- .../dnetlib/dhp/provision/ProvisionUtil.java | 70 +- .../dhp/provision/RelatedItemInfo.java | 74 +- .../provision/SparkExtractRelationCount.java | 42 +- .../dhp/provision/SparkGenerateScholix.java | 161 +- .../dhp/provision/SparkGenerateSummary.java | 166 +- .../provision/SparkIndexCollectionOnES.java | 99 +- 
.../dhp/provision/scholix/Scholix.java | 310 +- .../scholix/ScholixCollectedFrom.java | 58 +- .../provision/scholix/ScholixEntityId.java | 40 +- .../provision/scholix/ScholixIdentifier.java | 40 +- .../scholix/ScholixRelationship.java | 56 +- .../provision/scholix/ScholixResource.java | 216 +- .../scholix/summary/CollectedFromType.java | 56 +- .../scholix/summary/SchemeValue.java | 40 +- .../scholix/summary/ScholixSummary.java | 530 ++-- .../scholix/summary/TypedIdentifier.java | 40 +- .../provision/scholix/summary/Typology.java | 5 +- .../provision/update/CrossRefParserJSON.java | 202 +- .../dhp/provision/update/CrossrefClient.java | 133 +- .../provision/update/Datacite2Scholix.java | 389 +-- .../dhp/provision/update/DataciteClient.java | 103 +- .../update/DataciteClientIterator.java | 180 +- .../update/RetrieveUpdateFromDatacite.java | 105 +- .../update/SparkResolveScholixTarget.java | 287 +- .../dhp/provision/DataciteClientTest.java | 70 +- .../dhp/provision/ExtractInfoTest.java | 39 +- .../oa/provision/AdjacencyListBuilderJob.java | 181 +- .../CreateRelatedEntitiesJob_phase1.java | 373 ++- .../CreateRelatedEntitiesJob_phase2.java | 339 ++- .../dhp/oa/provision/PrepareRelationsJob.java | 242 +- .../dhp/oa/provision/XmlConverterJob.java | 335 +-- .../dhp/oa/provision/XmlIndexingJob.java | 342 +-- .../oa/provision/model/EntityRelEntity.java | 93 +- .../dhp/oa/provision/model/JoinedEntity.java | 32 +- .../dhp/oa/provision/model/RelatedEntity.java | 456 +-- .../oa/provision/model/SortableRelation.java | 50 +- .../dhp/oa/provision/model/Tuple2.java | 67 +- .../dhp/oa/provision/model/TypedRow.java | 84 +- .../dhp/oa/provision/utils/ContextDef.java | 71 +- .../dhp/oa/provision/utils/ContextMapper.java | 59 +- .../oa/provision/utils/GraphMappingUtils.java | 26 +- .../oa/provision/utils/LicenseComparator.java | 78 +- .../provision/utils/RelationPartitioner.java | 33 +- .../utils/StreamingInputDocumentFactory.java | 376 ++- .../oa/provision/utils/TemplateFactory.java | 
176 +- .../oa/provision/utils/TemplateResources.java | 61 +- .../oa/provision/utils/XmlRecordFactory.java | 2482 +++++++++-------- .../utils/XmlSerializationUtils.java | 261 +- .../dhp/oa/provision/GraphJoinerTest.java | 57 +- 232 files changed, 22079 insertions(+), 20939 deletions(-) diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java index ccc2abef02..10a25fdc30 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java @@ -1,8 +1,10 @@ + package eu.dnetlib.maven.plugin.properties; import java.io.File; import java.util.ArrayList; import java.util.List; + import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import org.apache.maven.plugin.AbstractMojo; @@ -17,55 +19,58 @@ import org.apache.maven.plugin.MojoFailureException; */ public class GenerateOoziePropertiesMojo extends AbstractMojo { - public static final String PROPERTY_NAME_WF_SOURCE_DIR = "workflow.source.dir"; - public static final String PROPERTY_NAME_SANDBOX_NAME = "sandboxName"; + public static final String PROPERTY_NAME_WF_SOURCE_DIR = "workflow.source.dir"; + public static final String PROPERTY_NAME_SANDBOX_NAME = "sandboxName"; - private final String[] limiters = {"dhp", "dnetlib", "eu"}; + private final String[] limiters = { + "dhp", "dnetlib", "eu" + }; - @Override - public void execute() throws MojoExecutionException, MojoFailureException { - if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) - && !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) { - String generatedSandboxName = - 
generateSandboxName(System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); - if (generatedSandboxName != null) { - System.getProperties().setProperty(PROPERTY_NAME_SANDBOX_NAME, generatedSandboxName); - } else { - System.out.println( - "unable to generate sandbox name from path: " - + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); - } - } - } + @Override + public void execute() throws MojoExecutionException, MojoFailureException { + if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) + && !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) { + String generatedSandboxName = generateSandboxName( + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); + if (generatedSandboxName != null) { + System.getProperties().setProperty(PROPERTY_NAME_SANDBOX_NAME, generatedSandboxName); + } else { + System.out + .println( + "unable to generate sandbox name from path: " + + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); + } + } + } - /** - * Generates sandbox name from workflow source directory. - * - * @param wfSourceDir - * @return generated sandbox name - */ - private String generateSandboxName(String wfSourceDir) { - // utilize all dir names until finding one of the limiters - List sandboxNameParts = new ArrayList(); - String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar); - ArrayUtils.reverse(tokens); - if (tokens.length > 0) { - for (String token : tokens) { - for (String limiter : limiters) { - if (limiter.equals(token)) { - return sandboxNameParts.size() > 0 - ? StringUtils.join(sandboxNameParts.toArray()) - : null; - } - } - if (sandboxNameParts.size() > 0) { - sandboxNameParts.add(0, File.separator); - } - sandboxNameParts.add(0, token); - } - return StringUtils.join(sandboxNameParts.toArray()); - } else { - return null; - } - } + /** + * Generates sandbox name from workflow source directory. 
+ * + * @param wfSourceDir + * @return generated sandbox name + */ + private String generateSandboxName(String wfSourceDir) { + // utilize all dir names until finding one of the limiters + List sandboxNameParts = new ArrayList(); + String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar); + ArrayUtils.reverse(tokens); + if (tokens.length > 0) { + for (String token : tokens) { + for (String limiter : limiters) { + if (limiter.equals(token)) { + return sandboxNameParts.size() > 0 + ? StringUtils.join(sandboxNameParts.toArray()) + : null; + } + } + if (sandboxNameParts.size() > 0) { + sandboxNameParts.add(0, File.separator); + } + sandboxNameParts.add(0, token); + } + return StringUtils.join(sandboxNameParts.toArray()); + } else { + return null; + } + } } diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java index 5e0166e4f7..c1c567f954 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java @@ -9,9 +9,9 @@ * express or implied. See the License for the specific language governing permissions and * limitations under the License. 
*/ + package eu.dnetlib.maven.plugin.properties; -import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -24,6 +24,7 @@ import java.util.List; import java.util.Map.Entry; import java.util.Properties; import java.util.Set; + import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; @@ -35,6 +36,8 @@ import org.springframework.core.io.DefaultResourceLoader; import org.springframework.core.io.Resource; import org.springframework.core.io.ResourceLoader; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; + /** * Writes project properties for the keys listed in specified properties files. Based on: * http://site.kuali.org/maven/plugins/properties-maven-plugin/1.3.2/write-project-properties-mojo.html @@ -44,401 +47,401 @@ import org.springframework.core.io.ResourceLoader; */ public class WritePredefinedProjectProperties extends AbstractMojo { - private static final String CR = "\r"; - private static final String LF = "\n"; - private static final String TAB = "\t"; - protected static final String PROPERTY_PREFIX_ENV = "env."; - private static final String ENCODING_UTF8 = "utf8"; + private static final String CR = "\r"; + private static final String LF = "\n"; + private static final String TAB = "\t"; + protected static final String PROPERTY_PREFIX_ENV = "env."; + private static final String ENCODING_UTF8 = "utf8"; - /** @parameter property="properties.includePropertyKeysFromFiles" */ - private String[] includePropertyKeysFromFiles; + /** @parameter property="properties.includePropertyKeysFromFiles" */ + private String[] includePropertyKeysFromFiles; - /** - * @parameter default-value="${project}" - * @required - * @readonly - */ - protected MavenProject project; + /** + * @parameter default-value="${project}" + * @required + * @readonly + */ + protected MavenProject project; - /** - * The file that properties will 
be written to - * - * @parameter property="properties.outputFile" - * default-value="${project.build.directory}/properties/project.properties"; - * @required - */ - protected File outputFile; + /** + * The file that properties will be written to + * + * @parameter property="properties.outputFile" + * default-value="${project.build.directory}/properties/project.properties"; + * @required + */ + protected File outputFile; - /** - * If true, the plugin will silently ignore any non-existent properties files, and the build will - * continue - * - * @parameter property="properties.quiet" default-value="true" - */ - private boolean quiet; + /** + * If true, the plugin will silently ignore any non-existent properties files, and the build will continue + * + * @parameter property="properties.quiet" default-value="true" + */ + private boolean quiet; - /** - * Comma separated list of characters to escape when writing property values. cr=carriage return, - * lf=linefeed, tab=tab. Any other values are taken literally. - * - * @parameter default-value="cr,lf,tab" property="properties.escapeChars" - */ - private String escapeChars; + /** + * Comma separated list of characters to escape when writing property values. cr=carriage return, lf=linefeed, + * tab=tab. Any other values are taken literally. + * + * @parameter default-value="cr,lf,tab" property="properties.escapeChars" + */ + private String escapeChars; - /** - * If true, the plugin will include system properties when writing the properties file. System - * properties override both environment variables and project properties. - * - * @parameter default-value="false" property="properties.includeSystemProperties" - */ - private boolean includeSystemProperties; + /** + * If true, the plugin will include system properties when writing the properties file. System properties override + * both environment variables and project properties. 
+ * + * @parameter default-value="false" property="properties.includeSystemProperties" + */ + private boolean includeSystemProperties; - /** - * If true, the plugin will include environment variables when writing the properties file. - * Environment variables are prefixed with "env". Environment variables override project - * properties. - * - * @parameter default-value="false" property="properties.includeEnvironmentVariables" - */ - private boolean includeEnvironmentVariables; + /** + * If true, the plugin will include environment variables when writing the properties file. Environment variables + * are prefixed with "env". Environment variables override project properties. + * + * @parameter default-value="false" property="properties.includeEnvironmentVariables" + */ + private boolean includeEnvironmentVariables; - /** - * Comma separated set of properties to exclude when writing the properties file - * - * @parameter property="properties.exclude" - */ - private String exclude; + /** + * Comma separated set of properties to exclude when writing the properties file + * + * @parameter property="properties.exclude" + */ + private String exclude; - /** - * Comma separated set of properties to write to the properties file. If provided, only the - * properties matching those supplied here will be written to the properties file. - * - * @parameter property="properties.include" - */ - private String include; + /** + * Comma separated set of properties to write to the properties file. If provided, only the properties matching + * those supplied here will be written to the properties file. 
+ * + * @parameter property="properties.include" + */ + private String include; - /* - * (non-Javadoc) - * @see org.apache.maven.plugin.AbstractMojo#execute() - */ - @Override - @SuppressFBWarnings({"NP_UNWRITTEN_FIELD", "UWF_UNWRITTEN_FIELD"}) - public void execute() throws MojoExecutionException, MojoFailureException { - Properties properties = new Properties(); - // Add project properties - properties.putAll(project.getProperties()); - if (includeEnvironmentVariables) { - // Add environment variables, overriding any existing properties with the same key - properties.putAll(getEnvironmentVariables()); - } - if (includeSystemProperties) { - // Add system properties, overriding any existing properties with the same key - properties.putAll(System.getProperties()); - } + /* + * (non-Javadoc) + * @see org.apache.maven.plugin.AbstractMojo#execute() + */ + @Override + @SuppressFBWarnings({ + "NP_UNWRITTEN_FIELD", "UWF_UNWRITTEN_FIELD" + }) + public void execute() throws MojoExecutionException, MojoFailureException { + Properties properties = new Properties(); + // Add project properties + properties.putAll(project.getProperties()); + if (includeEnvironmentVariables) { + // Add environment variables, overriding any existing properties with the same key + properties.putAll(getEnvironmentVariables()); + } + if (includeSystemProperties) { + // Add system properties, overriding any existing properties with the same key + properties.putAll(System.getProperties()); + } - // Remove properties as appropriate - trim(properties, exclude, include); + // Remove properties as appropriate + trim(properties, exclude, include); - String comment = "# " + new Date() + "\n"; - List escapeTokens = getEscapeChars(escapeChars); + String comment = "# " + new Date() + "\n"; + List escapeTokens = getEscapeChars(escapeChars); - getLog().info("Creating " + outputFile); - writeProperties(outputFile, comment, properties, escapeTokens); - } + getLog().info("Creating " + outputFile); + 
writeProperties(outputFile, comment, properties, escapeTokens); + } - /** - * Provides environment variables. - * - * @return environment variables - */ - protected static Properties getEnvironmentVariables() { - Properties props = new Properties(); - for (Entry entry : System.getenv().entrySet()) { - props.setProperty(PROPERTY_PREFIX_ENV + entry.getKey(), entry.getValue()); - } - return props; - } + /** + * Provides environment variables. + * + * @return environment variables + */ + protected static Properties getEnvironmentVariables() { + Properties props = new Properties(); + for (Entry entry : System.getenv().entrySet()) { + props.setProperty(PROPERTY_PREFIX_ENV + entry.getKey(), entry.getValue()); + } + return props; + } - /** - * Removes properties which should not be written. - * - * @param properties - * @param omitCSV - * @param includeCSV - * @throws MojoExecutionException - */ - protected void trim(Properties properties, String omitCSV, String includeCSV) - throws MojoExecutionException { - List omitKeys = getListFromCSV(omitCSV); - for (String key : omitKeys) { - properties.remove(key); - } + /** + * Removes properties which should not be written. 
+ * + * @param properties + * @param omitCSV + * @param includeCSV + * @throws MojoExecutionException + */ + protected void trim(Properties properties, String omitCSV, String includeCSV) + throws MojoExecutionException { + List omitKeys = getListFromCSV(omitCSV); + for (String key : omitKeys) { + properties.remove(key); + } - List includeKeys = getListFromCSV(includeCSV); - // mh: including keys from predefined properties - if (includePropertyKeysFromFiles != null && includePropertyKeysFromFiles.length > 0) { - for (String currentIncludeLoc : includePropertyKeysFromFiles) { - if (validate(currentIncludeLoc)) { - Properties p = getProperties(currentIncludeLoc); - for (String key : p.stringPropertyNames()) { - includeKeys.add(key); - } - } - } - } - if (includeKeys != null && !includeKeys.isEmpty()) { - // removing only when include keys provided - Set keys = properties.stringPropertyNames(); - for (String key : keys) { - if (!includeKeys.contains(key)) { - properties.remove(key); - } - } - } - } + List includeKeys = getListFromCSV(includeCSV); + // mh: including keys from predefined properties + if (includePropertyKeysFromFiles != null && includePropertyKeysFromFiles.length > 0) { + for (String currentIncludeLoc : includePropertyKeysFromFiles) { + if (validate(currentIncludeLoc)) { + Properties p = getProperties(currentIncludeLoc); + for (String key : p.stringPropertyNames()) { + includeKeys.add(key); + } + } + } + } + if (includeKeys != null && !includeKeys.isEmpty()) { + // removing only when include keys provided + Set keys = properties.stringPropertyNames(); + for (String key : keys) { + if (!includeKeys.contains(key)) { + properties.remove(key); + } + } + } + } - /** - * Checks whether file exists. - * - * @param location - * @return true when exists, false otherwise. 
- */ - protected boolean exists(String location) { - if (StringUtils.isBlank(location)) { - return false; - } - File file = new File(location); - if (file.exists()) { - return true; - } - ResourceLoader loader = new DefaultResourceLoader(); - Resource resource = loader.getResource(location); - return resource.exists(); - } + /** + * Checks whether file exists. + * + * @param location + * @return true when exists, false otherwise. + */ + protected boolean exists(String location) { + if (StringUtils.isBlank(location)) { + return false; + } + File file = new File(location); + if (file.exists()) { + return true; + } + ResourceLoader loader = new DefaultResourceLoader(); + Resource resource = loader.getResource(location); + return resource.exists(); + } - /** - * Validates resource location. - * - * @param location - * @return true when valid, false otherwise - * @throws MojoExecutionException - */ - protected boolean validate(String location) throws MojoExecutionException { - boolean exists = exists(location); - if (exists) { - return true; - } - if (quiet) { - getLog().info("Ignoring non-existent properties file '" + location + "'"); - return false; - } else { - throw new MojoExecutionException("Non-existent properties file '" + location + "'"); - } - } + /** + * Validates resource location. + * + * @param location + * @return true when valid, false otherwise + * @throws MojoExecutionException + */ + protected boolean validate(String location) throws MojoExecutionException { + boolean exists = exists(location); + if (exists) { + return true; + } + if (quiet) { + getLog().info("Ignoring non-existent properties file '" + location + "'"); + return false; + } else { + throw new MojoExecutionException("Non-existent properties file '" + location + "'"); + } + } - /** - * Provides input stream. 
- * - * @param location - * @return input stream - * @throws IOException - */ - protected InputStream getInputStream(String location) throws IOException { - File file = new File(location); - if (file.exists()) { - return new FileInputStream(location); - } - ResourceLoader loader = new DefaultResourceLoader(); - Resource resource = loader.getResource(location); - return resource.getInputStream(); - } + /** + * Provides input stream. + * + * @param location + * @return input stream + * @throws IOException + */ + protected InputStream getInputStream(String location) throws IOException { + File file = new File(location); + if (file.exists()) { + return new FileInputStream(location); + } + ResourceLoader loader = new DefaultResourceLoader(); + Resource resource = loader.getResource(location); + return resource.getInputStream(); + } - /** - * Creates properties for given location. - * - * @param location - * @return properties for given location - * @throws MojoExecutionException - */ - protected Properties getProperties(String location) throws MojoExecutionException { - InputStream in = null; - try { - Properties properties = new Properties(); - in = getInputStream(location); - if (location.toLowerCase().endsWith(".xml")) { - properties.loadFromXML(in); - } else { - properties.load(in); - } - return properties; - } catch (IOException e) { - throw new MojoExecutionException("Error reading properties file " + location, e); - } finally { - IOUtils.closeQuietly(in); - } - } + /** + * Creates properties for given location. 
+ * + * @param location + * @return properties for given location + * @throws MojoExecutionException + */ + protected Properties getProperties(String location) throws MojoExecutionException { + InputStream in = null; + try { + Properties properties = new Properties(); + in = getInputStream(location); + if (location.toLowerCase().endsWith(".xml")) { + properties.loadFromXML(in); + } else { + properties.load(in); + } + return properties; + } catch (IOException e) { + throw new MojoExecutionException("Error reading properties file " + location, e); + } finally { + IOUtils.closeQuietly(in); + } + } - /** - * Provides escape characters. - * - * @param escapeChars - * @return escape characters - */ - protected List getEscapeChars(String escapeChars) { - List tokens = getListFromCSV(escapeChars); - List realTokens = new ArrayList(); - for (String token : tokens) { - String realToken = getRealToken(token); - realTokens.add(realToken); - } - return realTokens; - } + /** + * Provides escape characters. + * + * @param escapeChars + * @return escape characters + */ + protected List getEscapeChars(String escapeChars) { + List tokens = getListFromCSV(escapeChars); + List realTokens = new ArrayList(); + for (String token : tokens) { + String realToken = getRealToken(token); + realTokens.add(realToken); + } + return realTokens; + } - /** - * Provides real token. - * - * @param token - * @return real token - */ - protected String getRealToken(String token) { - if (token.equalsIgnoreCase("CR")) { - return CR; - } else if (token.equalsIgnoreCase("LF")) { - return LF; - } else if (token.equalsIgnoreCase("TAB")) { - return TAB; - } else { - return token; - } - } + /** + * Provides real token. 
+ * + * @param token + * @return real token + */ + protected String getRealToken(String token) { + if (token.equalsIgnoreCase("CR")) { + return CR; + } else if (token.equalsIgnoreCase("LF")) { + return LF; + } else if (token.equalsIgnoreCase("TAB")) { + return TAB; + } else { + return token; + } + } - /** - * Returns content. - * - * @param comment - * @param properties - * @param escapeTokens - * @return content - */ - protected String getContent(String comment, Properties properties, List escapeTokens) { - List names = new ArrayList(properties.stringPropertyNames()); - Collections.sort(names); - StringBuilder sb = new StringBuilder(); - if (!StringUtils.isBlank(comment)) { - sb.append(comment); - } - for (String name : names) { - String value = properties.getProperty(name); - String escapedValue = escape(value, escapeTokens); - sb.append(name + "=" + escapedValue + "\n"); - } - return sb.toString(); - } + /** + * Returns content. + * + * @param comment + * @param properties + * @param escapeTokens + * @return content + */ + protected String getContent(String comment, Properties properties, List escapeTokens) { + List names = new ArrayList(properties.stringPropertyNames()); + Collections.sort(names); + StringBuilder sb = new StringBuilder(); + if (!StringUtils.isBlank(comment)) { + sb.append(comment); + } + for (String name : names) { + String value = properties.getProperty(name); + String escapedValue = escape(value, escapeTokens); + sb.append(name + "=" + escapedValue + "\n"); + } + return sb.toString(); + } - /** - * Writes properties to given file. 
- * - * @param file - * @param comment - * @param properties - * @param escapeTokens - * @throws MojoExecutionException - */ - protected void writeProperties( - File file, String comment, Properties properties, List escapeTokens) - throws MojoExecutionException { - try { - String content = getContent(comment, properties, escapeTokens); - FileUtils.writeStringToFile(file, content, ENCODING_UTF8); - } catch (IOException e) { - throw new MojoExecutionException("Error creating properties file", e); - } - } + /** + * Writes properties to given file. + * + * @param file + * @param comment + * @param properties + * @param escapeTokens + * @throws MojoExecutionException + */ + protected void writeProperties( + File file, String comment, Properties properties, List escapeTokens) + throws MojoExecutionException { + try { + String content = getContent(comment, properties, escapeTokens); + FileUtils.writeStringToFile(file, content, ENCODING_UTF8); + } catch (IOException e) { + throw new MojoExecutionException("Error creating properties file", e); + } + } - /** - * Escapes characters. - * - * @param s - * @param escapeChars - * @return - */ - protected String escape(String s, List escapeChars) { - String result = s; - for (String escapeChar : escapeChars) { - result = result.replace(escapeChar, getReplacementToken(escapeChar)); - } - return result; - } + /** + * Escapes characters. + * + * @param s + * @param escapeChars + * @return + */ + protected String escape(String s, List escapeChars) { + String result = s; + for (String escapeChar : escapeChars) { + result = result.replace(escapeChar, getReplacementToken(escapeChar)); + } + return result; + } - /** - * Provides replacement token. 
- * - * @param escapeChar - * @return replacement token - */ - protected String getReplacementToken(String escapeChar) { - if (escapeChar.equals(CR)) { - return "\\r"; - } else if (escapeChar.equals(LF)) { - return "\\n"; - } else if (escapeChar.equals(TAB)) { - return "\\t"; - } else { - return "\\" + escapeChar; - } - } + /** + * Provides replacement token. + * + * @param escapeChar + * @return replacement token + */ + protected String getReplacementToken(String escapeChar) { + if (escapeChar.equals(CR)) { + return "\\r"; + } else if (escapeChar.equals(LF)) { + return "\\n"; + } else if (escapeChar.equals(TAB)) { + return "\\t"; + } else { + return "\\" + escapeChar; + } + } - /** - * Returns list from csv. - * - * @param csv - * @return list of values generated from CSV - */ - protected static final List getListFromCSV(String csv) { - if (StringUtils.isBlank(csv)) { - return new ArrayList(); - } - List list = new ArrayList(); - String[] tokens = StringUtils.split(csv, ","); - for (String token : tokens) { - list.add(token.trim()); - } - return list; - } + /** + * Returns list from csv. 
+ * + * @param csv + * @return list of values generated from CSV + */ + protected static final List getListFromCSV(String csv) { + if (StringUtils.isBlank(csv)) { + return new ArrayList(); + } + List list = new ArrayList(); + String[] tokens = StringUtils.split(csv, ","); + for (String token : tokens) { + list.add(token.trim()); + } + return list; + } - public void setIncludeSystemProperties(boolean includeSystemProperties) { - this.includeSystemProperties = includeSystemProperties; - } + public void setIncludeSystemProperties(boolean includeSystemProperties) { + this.includeSystemProperties = includeSystemProperties; + } - public void setEscapeChars(String escapeChars) { - this.escapeChars = escapeChars; - } + public void setEscapeChars(String escapeChars) { + this.escapeChars = escapeChars; + } - public void setIncludeEnvironmentVariables(boolean includeEnvironmentVariables) { - this.includeEnvironmentVariables = includeEnvironmentVariables; - } + public void setIncludeEnvironmentVariables(boolean includeEnvironmentVariables) { + this.includeEnvironmentVariables = includeEnvironmentVariables; + } - public void setExclude(String exclude) { - this.exclude = exclude; - } + public void setExclude(String exclude) { + this.exclude = exclude; + } - public void setInclude(String include) { - this.include = include; - } + public void setInclude(String include) { + this.include = include; + } - public void setQuiet(boolean quiet) { - this.quiet = quiet; - } + public void setQuiet(boolean quiet) { + this.quiet = quiet; + } - /** - * Sets property files for which keys properties should be included. 
- * - * @param includePropertyKeysFromFiles - */ - public void setIncludePropertyKeysFromFiles(String[] includePropertyKeysFromFiles) { - if (includePropertyKeysFromFiles != null) { - this.includePropertyKeysFromFiles = - Arrays.copyOf(includePropertyKeysFromFiles, includePropertyKeysFromFiles.length); - } - } + /** + * Sets property files for which keys properties should be included. + * + * @param includePropertyKeysFromFiles + */ + public void setIncludePropertyKeysFromFiles(String[] includePropertyKeysFromFiles) { + if (includePropertyKeysFromFiles != null) { + this.includePropertyKeysFromFiles = Arrays + .copyOf(includePropertyKeysFromFiles, includePropertyKeysFromFiles.length); + } + } } diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java index 3a0d5fcc74..b8075ba5dd 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.maven.plugin.properties; import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME; @@ -10,87 +11,87 @@ import org.junit.jupiter.api.Test; /** @author mhorst, claudio.atzori */ public class GenerateOoziePropertiesMojoTest { - private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); + private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); - @BeforeEach - public void clearSystemProperties() { - System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); - System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); - } + @BeforeEach + public void clearSystemProperties() { + 
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); + System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); + } - @Test - public void testExecuteEmpty() throws Exception { - // execute - mojo.execute(); + @Test + public void testExecuteEmpty() throws Exception { + // execute + mojo.execute(); - // assert - assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void testExecuteSandboxNameAlreadySet() throws Exception { - // given - String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; - String sandboxName = "originalSandboxName"; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - System.setProperty(PROPERTY_NAME_SANDBOX_NAME, sandboxName); + @Test + public void testExecuteSandboxNameAlreadySet() throws Exception { + // given + String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; + String sandboxName = "originalSandboxName"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + System.setProperty(PROPERTY_NAME_SANDBOX_NAME, sandboxName); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertEquals(sandboxName, System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertEquals(sandboxName, System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void testExecuteEmptyWorkflowSourceDir() throws Exception { - // given - String workflowSourceDir = ""; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + @Test + public void testExecuteEmptyWorkflowSourceDir() throws Exception { + // given + String workflowSourceDir = ""; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void 
testExecuteNullSandboxNameGenerated() throws Exception { - // given - String workflowSourceDir = "eu/dnetlib/dhp/"; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + @Test + public void testExecuteNullSandboxNameGenerated() throws Exception { + // given + String workflowSourceDir = "eu/dnetlib/dhp/"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void testExecute() throws Exception { - // given - String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + @Test + public void testExecute() throws Exception { + // given + String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void testExecuteWithoutRoot() throws Exception { - // given - String workflowSourceDir = "wf/transformers"; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + @Test + public void testExecuteWithoutRoot() throws Exception { + // given + String workflowSourceDir = "wf/transformers"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } } diff --git 
a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java index 1b247198b8..e0b2eff37b 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.maven.plugin.properties; import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV; @@ -7,6 +8,7 @@ import static org.mockito.Mockito.lenient; import java.io.*; import java.util.Properties; + import org.apache.maven.plugin.MojoExecutionException; import org.apache.maven.project.MavenProject; import org.junit.jupiter.api.*; @@ -20,337 +22,353 @@ import org.mockito.junit.jupiter.MockitoExtension; @ExtendWith(MockitoExtension.class) public class WritePredefinedProjectPropertiesTest { - @Mock private MavenProject mavenProject; + @Mock + private MavenProject mavenProject; - private WritePredefinedProjectProperties mojo; + private WritePredefinedProjectProperties mojo; - @BeforeEach - public void init(@TempDir File testFolder) { - MockitoAnnotations.initMocks(this); - mojo = new WritePredefinedProjectProperties(); - mojo.outputFile = getPropertiesFileLocation(testFolder); - mojo.project = mavenProject; - lenient().doReturn(new Properties()).when(mavenProject).getProperties(); - } + @BeforeEach + public void init(@TempDir File testFolder) { + MockitoAnnotations.initMocks(this); + mojo = new WritePredefinedProjectProperties(); + mojo.outputFile = getPropertiesFileLocation(testFolder); + mojo.project = mavenProject; + lenient().doReturn(new Properties()).when(mavenProject).getProperties(); + } - // 
----------------------------------- TESTS --------------------------------------------- + // ----------------------------------- TESTS --------------------------------------------- - @Test - public void testExecuteEmpty() throws Exception { - // execute - mojo.execute(); + @Test + public void testExecuteEmpty() throws Exception { + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); - assertEquals(0, storedProperties.size()); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); + assertEquals(0, storedProperties.size()); + } - @Test - public void testExecuteWithProjectProperties() throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteWithProjectProperties() throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + doReturn(projectProperties).when(mavenProject).getProperties(); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(key)); - assertEquals(value, storedProperties.getProperty(key)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(key)); + assertEquals(value, 
storedProperties.getProperty(key)); + } - @Test() - public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.outputFile = testFolder; + @Test() + public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + doReturn(projectProperties).when(mavenProject).getProperties(); + mojo.outputFile = testFolder; - // execute - Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); - } + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } - @Test - public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String excludedKey = "excludedPropertyKey"; - String excludedValue = "excludedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(excludedKey, excludedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.setExclude(excludedKey); + @Test + public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String excludedKey = "excludedPropertyKey"; + String excludedValue = "excludedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(excludedKey, 
excludedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + mojo.setExclude(excludedKey); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(key)); - assertEquals(value, storedProperties.getProperty(key)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(key)); + assertEquals(value, storedProperties.getProperty(key)); + } - @Test - public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.setInclude(includedKey); + @Test + public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + mojo.setInclude(includedKey); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - 
assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(includedKey)); - assertEquals(includedValue, storedProperties.getProperty(includedKey)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } - @Test - public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - File includedPropertiesFile = new File(testFolder, "included.properties"); - Properties includedProperties = new Properties(); - includedProperties.setProperty(includedKey, "irrelevantValue"); - includedProperties.store(new FileWriter(includedPropertiesFile), null); + File includedPropertiesFile = new File(testFolder, "included.properties"); + Properties includedProperties = new Properties(); + includedProperties.setProperty(includedKey, "irrelevantValue"); + includedProperties.store(new 
FileWriter(includedPropertiesFile), null); - mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); + mojo.setIncludePropertyKeysFromFiles(new String[] { + includedPropertiesFile.getAbsolutePath() + }); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(includedKey)); - assertEquals(includedValue, storedProperties.getProperty(includedKey)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } - @Test - public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) - throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) + throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.setIncludePropertyKeysFromFiles( 
- new String[] {"/eu/dnetlib/maven/plugin/properties/included.properties"}); + mojo + .setIncludePropertyKeysFromFiles( + new String[] { + "/eu/dnetlib/maven/plugin/properties/included.properties" + }); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(includedKey)); - assertEquals(includedValue, storedProperties.getProperty(includedKey)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } - @Test - public void testExecuteIncludingPropertyKeysFromBlankLocation() { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromBlankLocation() { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.setIncludePropertyKeysFromFiles(new String[] {""}); + mojo.setIncludePropertyKeysFromFiles(new String[] { + "" + }); - // execute - 
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); - } + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } - @Test - public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) - throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) + throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - File includedPropertiesFile = new File(testFolder, "included.xml"); - Properties includedProperties = new Properties(); - includedProperties.setProperty(includedKey, "irrelevantValue"); - includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null); + File includedPropertiesFile = new File(testFolder, "included.xml"); + Properties includedProperties = new Properties(); + includedProperties.setProperty(includedKey, "irrelevantValue"); + includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null); - mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); + mojo.setIncludePropertyKeysFromFiles(new String[] { + includedPropertiesFile.getAbsolutePath() + }); - // execute - mojo.execute(); + 
// execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(includedKey)); - assertEquals(includedValue, storedProperties.getProperty(includedKey)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } - @Test - public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) - throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) + throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - File includedPropertiesFile = new File(testFolder, "included.xml"); - Properties includedProperties = new Properties(); - includedProperties.setProperty(includedKey, "irrelevantValue"); - includedProperties.store(new FileOutputStream(includedPropertiesFile), null); + File includedPropertiesFile = new File(testFolder, 
"included.xml"); + Properties includedProperties = new Properties(); + includedProperties.setProperty(includedKey, "irrelevantValue"); + includedProperties.store(new FileOutputStream(includedPropertiesFile), null); - mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); + mojo.setIncludePropertyKeysFromFiles(new String[] { + includedPropertiesFile.getAbsolutePath() + }); - // execute - Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); - } + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } - @Test - public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { - // given - mojo.setQuiet(true); - mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"}); + @Test + public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { + // given + mojo.setQuiet(true); + mojo.setIncludePropertyKeysFromFiles(new String[] { + "invalid location" + }); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(0, storedProperties.size()); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(0, storedProperties.size()); + } - @Test - public void testExecuteIncludingPropertyKeysFromInvalidFile() { - // given - mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"}); + @Test + public void testExecuteIncludingPropertyKeysFromInvalidFile() { + // given + mojo.setIncludePropertyKeysFromFiles(new String[] { + "invalid location" + }); - // execute - Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); - } + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } - @Test - public void testExecuteWithEnvironmentProperties(@TempDir 
File testFolder) throws Exception { - // given - mojo.setIncludeEnvironmentVariables(true); + @Test + public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception { + // given + mojo.setIncludeEnvironmentVariables(true); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertTrue(storedProperties.size() > 0); - for (Object currentKey : storedProperties.keySet()) { - assertTrue(((String) currentKey).startsWith(PROPERTY_PREFIX_ENV)); - } - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertTrue(storedProperties.size() > 0); + for (Object currentKey : storedProperties.keySet()) { + assertTrue(((String) currentKey).startsWith(PROPERTY_PREFIX_ENV)); + } + } - @Test - public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { - // given - String key = "systemPropertyKey"; - String value = "systemPropertyValue"; - System.setProperty(key, value); - mojo.setIncludeSystemProperties(true); + @Test + public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { + // given + String key = "systemPropertyKey"; + String value = "systemPropertyValue"; + System.setProperty(key, value); + mojo.setIncludeSystemProperties(true); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertTrue(storedProperties.size() > 0); - assertTrue(storedProperties.containsKey(key)); - assertEquals(value, storedProperties.getProperty(key)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertTrue(storedProperties.size() > 0); + assertTrue(storedProperties.containsKey(key)); + assertEquals(value, 
storedProperties.getProperty(key)); + } - @Test - public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) - throws Exception { - // given - String key = "systemPropertyKey "; - String value = "systemPropertyValue"; - System.setProperty(key, value); - mojo.setIncludeSystemProperties(true); - String escapeChars = "cr,lf,tab,|"; - mojo.setEscapeChars(escapeChars); + @Test + public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) + throws Exception { + // given + String key = "systemPropertyKey "; + String value = "systemPropertyValue"; + System.setProperty(key, value); + mojo.setIncludeSystemProperties(true); + String escapeChars = "cr,lf,tab,|"; + mojo.setEscapeChars(escapeChars); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertTrue(storedProperties.size() > 0); - assertFalse(storedProperties.containsKey(key)); - assertTrue(storedProperties.containsKey(key.trim())); - assertEquals(value, storedProperties.getProperty(key.trim())); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertTrue(storedProperties.size() > 0); + assertFalse(storedProperties.containsKey(key)); + assertTrue(storedProperties.containsKey(key.trim())); + assertEquals(value, storedProperties.getProperty(key.trim())); + } - // ----------------------------------- PRIVATE ------------------------------------------- + // ----------------------------------- PRIVATE ------------------------------------------- - private File getPropertiesFileLocation(File testFolder) { - return new File(testFolder, "test.properties"); - } + private File getPropertiesFileLocation(File testFolder) { + return new File(testFolder, "test.properties"); + } - private Properties getStoredProperties(File testFolder) - throws FileNotFoundException, IOException { - 
Properties properties = new Properties(); - properties.load(new FileInputStream(getPropertiesFileLocation(testFolder))); - return properties; - } + private Properties getStoredProperties(File testFolder) + throws FileNotFoundException, IOException { + Properties properties = new Properties(); + properties.load(new FileInputStream(getPropertiesFileLocation(testFolder))); + return properties; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java index 1a7c2a6efe..bfd70e8c63 100644 --- a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java +++ b/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java @@ -1,3 +1,4 @@ + package eu.dnetlib.collector.worker.model; import java.util.HashMap; @@ -5,43 +6,43 @@ import java.util.Map; public class ApiDescriptor { - private String id; + private String id; - private String baseUrl; + private String baseUrl; - private String protocol; + private String protocol; - private Map params = new HashMap<>(); + private Map params = new HashMap<>(); - public String getBaseUrl() { - return baseUrl; - } + public String getBaseUrl() { + return baseUrl; + } - public void setBaseUrl(final String baseUrl) { - this.baseUrl = baseUrl; - } + public void setBaseUrl(final String baseUrl) { + this.baseUrl = baseUrl; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public Map getParams() { - return params; - } + public Map getParams() { + return params; + } - public void setParams(final HashMap params) { - this.params = params; - } + public void setParams(final HashMap params) { + this.params = params; + } - public String getProtocol() { - return protocol; - } + public String getProtocol() { + return protocol; + } - public void 
setProtocol(final String protocol) { - this.protocol = protocol; - } + public void setProtocol(final String protocol) { + this.protocol = protocol; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java index f076bd188b..68fc024afa 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java @@ -1,7 +1,9 @@ + package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.UUID; + import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -11,107 +13,107 @@ import javax.persistence.Table; @Table(name = "mdstores") public class MDStore implements Serializable { - /** */ - private static final long serialVersionUID = 3160530489149700055L; + /** */ + private static final long serialVersionUID = 3160530489149700055L; - @Id - @Column(name = "id") - private String id; + @Id + @Column(name = "id") + private String id; - @Column(name = "format") - private String format; + @Column(name = "format") + private String format; - @Column(name = "layout") - private String layout; + @Column(name = "layout") + private String layout; - @Column(name = "interpretation") - private String interpretation; + @Column(name = "interpretation") + private String interpretation; - @Column(name = "datasource_name") - private String datasourceName; + @Column(name = "datasource_name") + private String datasourceName; - @Column(name = "datasource_id") - private String datasourceId; + @Column(name = "datasource_id") + private String datasourceId; - @Column(name = "api_id") - private String apiId; + @Column(name = "api_id") + private String apiId; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - 
this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getFormat() { - return format; - } + public String getFormat() { + return format; + } - public void setFormat(final String format) { - this.format = format; - } + public void setFormat(final String format) { + this.format = format; + } - public String getLayout() { - return layout; - } + public String getLayout() { + return layout; + } - public void setLayout(final String layout) { - this.layout = layout; - } + public void setLayout(final String layout) { + this.layout = layout; + } - public String getInterpretation() { - return interpretation; - } + public String getInterpretation() { + return interpretation; + } - public void setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } + public void setInterpretation(final String interpretation) { + this.interpretation = interpretation; + } - public String getDatasourceName() { - return datasourceName; - } + public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(final String datasourceName) { + this.datasourceName = datasourceName; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(final String datasourceId) { + this.datasourceId = datasourceId; + } - public String getApiId() { - return apiId; - } + public String getApiId() { + return apiId; + } - public void setApiId(final String apiId) { - this.apiId = apiId; - } + public void setApiId(final String apiId) { + this.apiId = apiId; + } - public static MDStore newInstance( - final String format, final String layout, final String interpretation) { - return newInstance(format, layout, 
interpretation, null, null, null); - } + public static MDStore newInstance( + final String format, final String layout, final String interpretation) { + return newInstance(format, layout, interpretation, null, null, null); + } - public static MDStore newInstance( - final String format, - final String layout, - final String interpretation, - final String dsName, - final String dsId, - final String apiId) { - final MDStore md = new MDStore(); - md.setId("md-" + UUID.randomUUID()); - md.setFormat(format); - md.setLayout(layout); - md.setInterpretation(interpretation); - md.setDatasourceName(dsName); - md.setDatasourceId(dsId); - md.setApiId(apiId); - return md; - } + public static MDStore newInstance( + final String format, + final String layout, + final String interpretation, + final String dsName, + final String dsId, + final String apiId) { + final MDStore md = new MDStore(); + md.setId("md-" + UUID.randomUUID()); + md.setFormat(format); + md.setLayout(layout); + md.setInterpretation(interpretation); + md.setDatasourceName(dsName); + md.setDatasourceId(dsId); + md.setApiId(apiId); + return md; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java index 0f8f043224..f74ab39be2 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java @@ -1,6 +1,8 @@ + package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; + import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -10,40 +12,40 @@ import javax.persistence.Table; @Table(name = "mdstore_current_versions") public class MDStoreCurrentVersion implements Serializable { - /** */ - private static final long serialVersionUID = -4757725888593745773L; 
+ /** */ + private static final long serialVersionUID = -4757725888593745773L; - @Id - @Column(name = "mdstore") - private String mdstore; + @Id + @Column(name = "mdstore") + private String mdstore; - @Column(name = "current_version") - private String currentVersion; + @Column(name = "current_version") + private String currentVersion; - public String getMdstore() { - return mdstore; - } + public String getMdstore() { + return mdstore; + } - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } + public void setMdstore(final String mdstore) { + this.mdstore = mdstore; + } - public String getCurrentVersion() { - return currentVersion; - } + public String getCurrentVersion() { + return currentVersion; + } - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } + public void setCurrentVersion(final String currentVersion) { + this.currentVersion = currentVersion; + } - public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) { - final MDStoreCurrentVersion cv = new MDStoreCurrentVersion(); - cv.setMdstore(mdId); - cv.setCurrentVersion(versionId); - return cv; - } + public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) { + final MDStoreCurrentVersion cv = new MDStoreCurrentVersion(); + cv.setMdstore(mdId); + cv.setCurrentVersion(versionId); + return cv; + } - public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) { - return newInstance(v.getMdstore(), v.getId()); - } + public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) { + return newInstance(v.getMdstore(), v.getId()); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java index ca784b2fb2..7ef24f1916 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java @@ -1,7 +1,9 @@ + package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; + import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -13,85 +15,85 @@ import javax.persistence.TemporalType; @Table(name = "mdstore_versions") public class MDStoreVersion implements Serializable { - /** */ - private static final long serialVersionUID = -4763494442274298339L; + /** */ + private static final long serialVersionUID = -4763494442274298339L; - @Id - @Column(name = "id") - private String id; + @Id + @Column(name = "id") + private String id; - @Column(name = "mdstore") - private String mdstore; + @Column(name = "mdstore") + private String mdstore; - @Column(name = "writing") - private boolean writing; + @Column(name = "writing") + private boolean writing; - @Column(name = "readcount") - private int readCount = 0; + @Column(name = "readcount") + private int readCount = 0; - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; + @Column(name = "lastupdate") + @Temporal(TemporalType.TIMESTAMP) + private Date lastUpdate; - @Column(name = "size") - private long size = 0; + @Column(name = "size") + private long size = 0; - public static MDStoreVersion newInstance(final String mdId, final boolean writing) { - final MDStoreVersion t = new MDStoreVersion(); - t.setId(mdId + "-" + new Date().getTime()); - t.setMdstore(mdId); - t.setLastUpdate(null); - t.setWriting(writing); - t.setReadCount(0); - t.setSize(0); - return t; - } + public static MDStoreVersion newInstance(final String mdId, final boolean writing) { + final MDStoreVersion t = new MDStoreVersion(); + t.setId(mdId + "-" + new Date().getTime()); + t.setMdstore(mdId); + t.setLastUpdate(null); + t.setWriting(writing); 
+ t.setReadCount(0); + t.setSize(0); + return t; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getMdstore() { - return mdstore; - } + public String getMdstore() { + return mdstore; + } - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } + public void setMdstore(final String mdstore) { + this.mdstore = mdstore; + } - public boolean isWriting() { - return writing; - } + public boolean isWriting() { + return writing; + } - public void setWriting(final boolean writing) { - this.writing = writing; - } + public void setWriting(final boolean writing) { + this.writing = writing; + } - public int getReadCount() { - return readCount; - } + public int getReadCount() { + return readCount; + } - public void setReadCount(final int readCount) { - this.readCount = readCount; - } + public void setReadCount(final int readCount) { + this.readCount = readCount; + } - public Date getLastUpdate() { - return lastUpdate; - } + public Date getLastUpdate() { + return lastUpdate; + } - public void setLastUpdate(final Date lastUpdate) { - this.lastUpdate = lastUpdate; - } + public void setLastUpdate(final Date lastUpdate) { + this.lastUpdate = lastUpdate; + } - public long getSize() { - return size; - } + public long getSize() { + return size; + } - public void setSize(final long size) { - this.size = size; - } + public void setSize(final long size) { + this.size = size; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java index 9225a4876d..4383592410 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ 
b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java @@ -1,7 +1,9 @@ + package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; + import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -13,129 +15,129 @@ import javax.persistence.TemporalType; @Table(name = "mdstores_with_info") public class MDStoreWithInfo implements Serializable { - /** */ - private static final long serialVersionUID = -8445784770687571492L; + /** */ + private static final long serialVersionUID = -8445784770687571492L; - @Id - @Column(name = "id") - private String id; + @Id + @Column(name = "id") + private String id; - @Column(name = "format") - private String format; + @Column(name = "format") + private String format; - @Column(name = "layout") - private String layout; + @Column(name = "layout") + private String layout; - @Column(name = "interpretation") - private String interpretation; + @Column(name = "interpretation") + private String interpretation; - @Column(name = "datasource_name") - private String datasourceName; + @Column(name = "datasource_name") + private String datasourceName; - @Column(name = "datasource_id") - private String datasourceId; + @Column(name = "datasource_id") + private String datasourceId; - @Column(name = "api_id") - private String apiId; + @Column(name = "api_id") + private String apiId; - @Column(name = "current_version") - private String currentVersion; + @Column(name = "current_version") + private String currentVersion; - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; + @Column(name = "lastupdate") + @Temporal(TemporalType.TIMESTAMP) + private Date lastUpdate; - @Column(name = "size") - private long size = 0; + @Column(name = "size") + private long size = 0; - @Column(name = "n_versions") - private long numberOfVersions = 0; + @Column(name = "n_versions") + private long numberOfVersions = 0; 
- public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getFormat() { - return format; - } + public String getFormat() { + return format; + } - public void setFormat(final String format) { - this.format = format; - } + public void setFormat(final String format) { + this.format = format; + } - public String getLayout() { - return layout; - } + public String getLayout() { + return layout; + } - public void setLayout(final String layout) { - this.layout = layout; - } + public void setLayout(final String layout) { + this.layout = layout; + } - public String getInterpretation() { - return interpretation; - } + public String getInterpretation() { + return interpretation; + } - public void setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } + public void setInterpretation(final String interpretation) { + this.interpretation = interpretation; + } - public String getDatasourceName() { - return datasourceName; - } + public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(final String datasourceName) { + this.datasourceName = datasourceName; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(final String datasourceId) { + this.datasourceId = datasourceId; + } - public String getApiId() { - return apiId; - } + public String getApiId() { + return apiId; + } - public void setApiId(final String apiId) { - this.apiId = apiId; - } + public void setApiId(final String apiId) { + this.apiId = apiId; + } - public String 
getCurrentVersion() { - return currentVersion; - } + public String getCurrentVersion() { + return currentVersion; + } - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } + public void setCurrentVersion(final String currentVersion) { + this.currentVersion = currentVersion; + } - public Date getLastUpdate() { - return lastUpdate; - } + public Date getLastUpdate() { + return lastUpdate; + } - public void setLastUpdate(final Date lastUpdate) { - this.lastUpdate = lastUpdate; - } + public void setLastUpdate(final Date lastUpdate) { + this.lastUpdate = lastUpdate; + } - public long getSize() { - return size; - } + public long getSize() { + return size; + } - public void setSize(final long size) { - this.size = size; - } + public void setSize(final long size) { + this.size = size; + } - public long getNumberOfVersions() { - return numberOfVersions; - } + public long getNumberOfVersions() { + return numberOfVersions; + } - public void setNumberOfVersions(final long numberOfVersions) { - this.numberOfVersions = numberOfVersions; - } + public void setNumberOfVersions(final long numberOfVersions) { + this.numberOfVersions = numberOfVersions; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java index d98874bf3e..e65b4bb0bf 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java @@ -1,6 +1,6 @@ + package eu.dnetlib.dhp.application; -import com.fasterxml.jackson.databind.ObjectMapper; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.Serializable; @@ -8,87 +8,91 @@ import java.io.StringWriter; import java.util.*; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; + import org.apache.commons.cli.*; 
import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; +import com.fasterxml.jackson.databind.ObjectMapper; + public class ArgumentApplicationParser implements Serializable { - private final Options options = new Options(); - private final Map objectMap = new HashMap<>(); + private final Options options = new Options(); + private final Map objectMap = new HashMap<>(); - private final List compressedValues = new ArrayList<>(); + private final List compressedValues = new ArrayList<>(); - public ArgumentApplicationParser(final String json_configuration) throws Exception { - final ObjectMapper mapper = new ObjectMapper(); - final OptionsParameter[] configuration = - mapper.readValue(json_configuration, OptionsParameter[].class); - createOptionMap(configuration); - } + public ArgumentApplicationParser(final String json_configuration) throws Exception { + final ObjectMapper mapper = new ObjectMapper(); + final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); + createOptionMap(configuration); + } - public ArgumentApplicationParser(final OptionsParameter[] configuration) { - createOptionMap(configuration); - } + public ArgumentApplicationParser(final OptionsParameter[] configuration) { + createOptionMap(configuration); + } - private void createOptionMap(final OptionsParameter[] configuration) { + private void createOptionMap(final OptionsParameter[] configuration) { - Arrays.stream(configuration) - .map( - conf -> { - final Option o = new Option(conf.getParamName(), true, conf.getParamDescription()); - o.setLongOpt(conf.getParamLongName()); - o.setRequired(conf.isParamRequired()); - if (conf.isCompressed()) { - compressedValues.add(conf.getParamLongName()); - } - return o; - }) - .forEach(options::addOption); + Arrays + .stream(configuration) + .map( + conf -> { + final Option o = new Option(conf.getParamName(), true, conf.getParamDescription()); + o.setLongOpt(conf.getParamLongName()); + 
o.setRequired(conf.isParamRequired()); + if (conf.isCompressed()) { + compressedValues.add(conf.getParamLongName()); + } + return o; + }) + .forEach(options::addOption); - // HelpFormatter formatter = new HelpFormatter(); - // formatter.printHelp("myapp", null, options, null, true); + // HelpFormatter formatter = new HelpFormatter(); + // formatter.printHelp("myapp", null, options, null, true); - } + } - public static String decompressValue(final String abstractCompressed) { - try { - byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes()); - GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray)); - final StringWriter stringWriter = new StringWriter(); - IOUtils.copy(gis, stringWriter); - return stringWriter.toString(); - } catch (Throwable e) { - System.out.println("Wrong value to decompress:" + abstractCompressed); - throw new RuntimeException(e); - } - } + public static String decompressValue(final String abstractCompressed) { + try { + byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes()); + GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray)); + final StringWriter stringWriter = new StringWriter(); + IOUtils.copy(gis, stringWriter); + return stringWriter.toString(); + } catch (Throwable e) { + System.out.println("Wrong value to decompress:" + abstractCompressed); + throw new RuntimeException(e); + } + } - public static String compressArgument(final String value) throws Exception { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - GZIPOutputStream gzip = new GZIPOutputStream(out); - gzip.write(value.getBytes()); - gzip.close(); - return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); - } + public static String compressArgument(final String value) throws Exception { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + GZIPOutputStream gzip = new GZIPOutputStream(out); + gzip.write(value.getBytes()); + gzip.close(); + return 
java.util.Base64.getEncoder().encodeToString(out.toByteArray()); + } - public void parseArgument(final String[] args) throws Exception { - CommandLineParser parser = new BasicParser(); - CommandLine cmd = parser.parse(options, args); - Arrays.stream(cmd.getOptions()) - .forEach( - it -> - objectMap.put( - it.getLongOpt(), - compressedValues.contains(it.getLongOpt()) - ? decompressValue(it.getValue()) - : it.getValue())); - } + public void parseArgument(final String[] args) throws Exception { + CommandLineParser parser = new BasicParser(); + CommandLine cmd = parser.parse(options, args); + Arrays + .stream(cmd.getOptions()) + .forEach( + it -> objectMap + .put( + it.getLongOpt(), + compressedValues.contains(it.getLongOpt()) + ? decompressValue(it.getValue()) + : it.getValue())); + } - public String get(final String key) { - return objectMap.get(key); - } + public String get(final String key) { + return objectMap.get(key); + } - public Map getObjectMap() { - return objectMap; - } + public Map getObjectMap() { + return objectMap; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java index 03227d3168..7004112e42 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java @@ -1,36 +1,38 @@ + package eu.dnetlib.dhp.application; public class OptionsParameter { - private String paramName; - private String paramLongName; - private String paramDescription; - private boolean paramRequired; - private boolean compressed; + private String paramName; + private String paramLongName; + private String paramDescription; + private boolean paramRequired; + private boolean compressed; - public OptionsParameter() {} + public OptionsParameter() { + } - public String getParamName() { - return paramName; - } + public String getParamName() { + return paramName; + } - public 
String getParamLongName() { - return paramLongName; - } + public String getParamLongName() { + return paramLongName; + } - public String getParamDescription() { - return paramDescription; - } + public String getParamDescription() { + return paramDescription; + } - public boolean isParamRequired() { - return paramRequired; - } + public boolean isParamRequired() { + return paramRequired; + } - public boolean isCompressed() { - return compressed; - } + public boolean isCompressed() { + return compressed; + } - public void setCompressed(boolean compressed) { - this.compressed = compressed; - } + public void setCompressed(boolean compressed) { + this.compressed = compressed; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java index 4b0e1506e5..e793e3f295 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.common; import java.io.Serializable; @@ -6,46 +7,48 @@ import java.util.function.Supplier; /** Provides serializable and throwing extensions to standard functional interfaces. */ public class FunctionalInterfaceSupport { - private FunctionalInterfaceSupport() {} + private FunctionalInterfaceSupport() { + } - /** - * Serializable supplier of any kind of objects. To be used withing spark processing pipelines - * when supplying functions externally. - * - * @param - */ - @FunctionalInterface - public interface SerializableSupplier extends Supplier, Serializable {} + /** + * Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying + * functions externally. 
+ * + * @param + */ + @FunctionalInterface + public interface SerializableSupplier extends Supplier, Serializable { + } - /** - * Extension of consumer accepting functions throwing an exception. - * - * @param - * @param - */ - @FunctionalInterface - public interface ThrowingConsumer { - void accept(T t) throws E; - } + /** + * Extension of consumer accepting functions throwing an exception. + * + * @param + * @param + */ + @FunctionalInterface + public interface ThrowingConsumer { + void accept(T t) throws E; + } - /** - * Extension of supplier accepting functions throwing an exception. - * - * @param - * @param - */ - @FunctionalInterface - public interface ThrowingSupplier { - T get() throws E; - } + /** + * Extension of supplier accepting functions throwing an exception. + * + * @param + * @param + */ + @FunctionalInterface + public interface ThrowingSupplier { + T get() throws E; + } - /** - * Extension of runnable accepting functions throwing an exception. - * - * @param - */ - @FunctionalInterface - public interface ThrowingRunnable { - void run() throws E; - } + /** + * Extension of runnable accepting functions throwing an exception. 
+ * + * @param + */ + @FunctionalInterface + public interface ThrowingRunnable { + void run() throws E; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java index 1e5c264d11..0b2cd571fa 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.common; import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; @@ -5,6 +6,7 @@ import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -14,58 +16,59 @@ import org.slf4j.LoggerFactory; /** HDFS utility methods. */ public class HdfsSupport { - private static final Logger logger = LoggerFactory.getLogger(HdfsSupport.class); + private static final Logger logger = LoggerFactory.getLogger(HdfsSupport.class); - private HdfsSupport() {} + private HdfsSupport() { + } - /** - * Checks a path (file or dir) exists on HDFS. - * - * @param path Path to be checked - * @param configuration Configuration of hadoop env - */ - public static boolean exists(String path, Configuration configuration) { - logger.info("Removing path: {}", path); - return rethrowAsRuntimeException( - () -> { - Path f = new Path(path); - FileSystem fileSystem = FileSystem.get(configuration); - return fileSystem.exists(f); - }); - } + /** + * Checks a path (file or dir) exists on HDFS. 
+ * + * @param path Path to be checked + * @param configuration Configuration of hadoop env + */ + public static boolean exists(String path, Configuration configuration) { + logger.info("Removing path: {}", path); + return rethrowAsRuntimeException( + () -> { + Path f = new Path(path); + FileSystem fileSystem = FileSystem.get(configuration); + return fileSystem.exists(f); + }); + } - /** - * Removes a path (file or dir) from HDFS. - * - * @param path Path to be removed - * @param configuration Configuration of hadoop env - */ - public static void remove(String path, Configuration configuration) { - logger.info("Removing path: {}", path); - rethrowAsRuntimeException( - () -> { - Path f = new Path(path); - FileSystem fileSystem = FileSystem.get(configuration); - if (fileSystem.exists(f)) { - fileSystem.delete(f, true); - } - }); - } + /** + * Removes a path (file or dir) from HDFS. + * + * @param path Path to be removed + * @param configuration Configuration of hadoop env + */ + public static void remove(String path, Configuration configuration) { + logger.info("Removing path: {}", path); + rethrowAsRuntimeException( + () -> { + Path f = new Path(path); + FileSystem fileSystem = FileSystem.get(configuration); + if (fileSystem.exists(f)) { + fileSystem.delete(f, true); + } + }); + } - /** - * Lists hadoop files located below path or alternatively lists subdirs under path. 
- * - * @param path Path to be listed for hadoop files - * @param configuration Configuration of hadoop env - * @return List with string locations of hadoop files - */ - public static List listFiles(String path, Configuration configuration) { - logger.info("Listing files in path: {}", path); - return rethrowAsRuntimeException( - () -> - Arrays.stream(FileSystem.get(configuration).listStatus(new Path(path))) - .filter(FileStatus::isDirectory) - .map(x -> x.getPath().toString()) - .collect(Collectors.toList())); - } + /** + * Lists hadoop files located below path or alternatively lists subdirs under path. + * + * @param path Path to be listed for hadoop files + * @param configuration Configuration of hadoop env + * @return List with string locations of hadoop files + */ + public static List listFiles(String path, Configuration configuration) { + logger.info("Listing files in path: {}", path); + return rethrowAsRuntimeException( + () -> Arrays + .stream(FileSystem.get(configuration).listStatus(new Path(path))) + .filter(FileStatus::isDirectory) + .map(x -> x.getPath().toString()) + .collect(Collectors.toList())); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java index 433f64ecdf..03cc949615 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java @@ -1,74 +1,75 @@ + package eu.dnetlib.dhp.common; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; import java.util.Objects; import java.util.function.Function; + import org.apache.spark.SparkConf; import org.apache.spark.sql.SparkSession; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; + /** SparkSession utility methods. 
*/ public class SparkSessionSupport { - private SparkSessionSupport() {} + private SparkSessionSupport() { + } - /** - * Runs a given function using SparkSession created using default builder and supplied SparkConf. - * Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created - * externally. - * - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkSession( - SparkConf conf, Boolean isSparkSessionManaged, ThrowingConsumer fn) { - runWithSparkSession( - c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn); - } + /** + * Runs a given function using SparkSession created using default builder and supplied SparkConf. Stops SparkSession + * when SparkSession is managed. Allows to reuse SparkSession created externally. + * + * @param conf SparkConf instance + * @param isSparkSessionManaged When true will stop SparkSession + * @param fn Consumer to be applied to constructed SparkSession + */ + public static void runWithSparkSession( + SparkConf conf, Boolean isSparkSessionManaged, ThrowingConsumer fn) { + runWithSparkSession( + c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn); + } - /** - * Runs a given function using SparkSession created with hive support and using default builder - * and supplied SparkConf. Stops SparkSession when SparkSession is managed. Allows to reuse - * SparkSession created externally. 
- * - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkHiveSession( - SparkConf conf, Boolean isSparkSessionManaged, ThrowingConsumer fn) { - runWithSparkSession( - c -> SparkSession.builder().config(c).enableHiveSupport().getOrCreate(), - conf, - isSparkSessionManaged, - fn); - } + /** + * Runs a given function using SparkSession created with hive support and using default builder and supplied + * SparkConf. Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally. + * + * @param conf SparkConf instance + * @param isSparkSessionManaged When true will stop SparkSession + * @param fn Consumer to be applied to constructed SparkSession + */ + public static void runWithSparkHiveSession( + SparkConf conf, Boolean isSparkSessionManaged, ThrowingConsumer fn) { + runWithSparkSession( + c -> SparkSession.builder().config(c).enableHiveSupport().getOrCreate(), + conf, + isSparkSessionManaged, + fn); + } - /** - * Runs a given function using SparkSession created using supplied builder and supplied SparkConf. - * Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created - * externally. 
- * - * @param sparkSessionBuilder Builder of SparkSession - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkSession( - Function sparkSessionBuilder, - SparkConf conf, - Boolean isSparkSessionManaged, - ThrowingConsumer fn) { - SparkSession spark = null; - try { - spark = sparkSessionBuilder.apply(conf); - fn.accept(spark); - } catch (Exception e) { - throw new RuntimeException(e); - } finally { - if (Objects.nonNull(spark) && isSparkSessionManaged) { - spark.stop(); - } - } - } + /** + * Runs a given function using SparkSession created using supplied builder and supplied SparkConf. Stops + * SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally. + * + * @param sparkSessionBuilder Builder of SparkSession + * @param conf SparkConf instance + * @param isSparkSessionManaged When true will stop SparkSession + * @param fn Consumer to be applied to constructed SparkSession + */ + public static void runWithSparkSession( + Function sparkSessionBuilder, + SparkConf conf, + Boolean isSparkSessionManaged, + ThrowingConsumer fn) { + SparkSession spark = null; + try { + spark = sparkSessionBuilder.apply(conf); + fn.accept(spark); + } catch (Exception e) { + throw new RuntimeException(e); + } finally { + if (Objects.nonNull(spark) && isSparkSessionManaged) { + spark.stop(); + } + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java index 54342a46ac..f3f59b2a21 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.common; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingRunnable; @@ -6,69 +7,70 @@ import 
eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingSupplier; /** Exception handling utility methods. */ public class ThrowingSupport { - private ThrowingSupport() {} + private ThrowingSupport() { + } - /** - * Executes given runnable and rethrows any exceptions as RuntimeException. - * - * @param fn Runnable to be executed - * @param Type of exception thrown - */ - public static void rethrowAsRuntimeException(ThrowingRunnable fn) { - try { - fn.run(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } + /** + * Executes given runnable and rethrows any exceptions as RuntimeException. + * + * @param fn Runnable to be executed + * @param Type of exception thrown + */ + public static void rethrowAsRuntimeException(ThrowingRunnable fn) { + try { + fn.run(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } - /** - * Executes given runnable and rethrows any exceptions as RuntimeException with custom message. - * - * @param fn Runnable to be executed - * @param msg Message to be set for rethrown exception - * @param Type of exception thrown - */ - public static void rethrowAsRuntimeException( - ThrowingRunnable fn, String msg) { - try { - fn.run(); - } catch (Exception e) { - throw new RuntimeException(msg, e); - } - } + /** + * Executes given runnable and rethrows any exceptions as RuntimeException with custom message. + * + * @param fn Runnable to be executed + * @param msg Message to be set for rethrown exception + * @param Type of exception thrown + */ + public static void rethrowAsRuntimeException( + ThrowingRunnable fn, String msg) { + try { + fn.run(); + } catch (Exception e) { + throw new RuntimeException(msg, e); + } + } - /** - * Executes given supplier and rethrows any exceptions as RuntimeException. 
- * - * @param fn Supplier to be executed - * @param Type of returned value - * @param Type of exception thrown - * @return Result of supplier execution - */ - public static T rethrowAsRuntimeException(ThrowingSupplier fn) { - try { - return fn.get(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } + /** + * Executes given supplier and rethrows any exceptions as RuntimeException. + * + * @param fn Supplier to be executed + * @param Type of returned value + * @param Type of exception thrown + * @return Result of supplier execution + */ + public static T rethrowAsRuntimeException(ThrowingSupplier fn) { + try { + return fn.get(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } - /** - * Executes given supplier and rethrows any exceptions as RuntimeException with custom message. - * - * @param fn Supplier to be executed - * @param msg Message to be set for rethrown exception - * @param Type of returned value - * @param Type of exception thrown - * @return Result of supplier execution - */ - public static T rethrowAsRuntimeException( - ThrowingSupplier fn, String msg) { - try { - return fn.get(); - } catch (Exception e) { - throw new RuntimeException(msg, e); - } - } + /** + * Executes given supplier and rethrows any exceptions as RuntimeException with custom message. 
+ * + * @param fn Supplier to be executed + * @param msg Message to be set for rethrown exception + * @param Type of returned value + * @param Type of exception thrown + * @return Result of supplier execution + */ + public static T rethrowAsRuntimeException( + ThrowingSupplier fn, String msg) { + try { + return fn.get(); + } catch (Exception e) { + throw new RuntimeException(msg, e); + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java index 56d7217ff4..ce65e710f9 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java @@ -1,120 +1,121 @@ + package eu.dnetlib.dhp.model.mdstore; -import eu.dnetlib.dhp.utils.DHPUtils; import java.io.Serializable; +import eu.dnetlib.dhp.utils.DHPUtils; + /** This class models a record inside the new Metadata store collection on HDFS * */ public class MetadataRecord implements Serializable { - /** The D-Net Identifier associated to the record */ - private String id; + /** The D-Net Identifier associated to the record */ + private String id; - /** The original Identifier of the record */ - private String originalId; + /** The original Identifier of the record */ + private String originalId; - /** The encoding of the record, should be JSON or XML */ - private String encoding; + /** The encoding of the record, should be JSON or XML */ + private String encoding; - /** - * The information about the provenance of the record see @{@link Provenance} for the model of - * this information - */ - private Provenance provenance; + /** + * The information about the provenance of the record see @{@link Provenance} for the model of this information + */ + private Provenance provenance; - /** The content of the metadata */ - private String body; + /** The content of the metadata */ + private String body; - /** the date when 
the record has been stored */ - private long dateOfCollection; + /** the date when the record has been stored */ + private long dateOfCollection; - /** the date when the record has been stored */ - private long dateOfTransformation; + /** the date when the record has been stored */ + private long dateOfTransformation; - public MetadataRecord() { - this.dateOfCollection = System.currentTimeMillis(); - } + public MetadataRecord() { + this.dateOfCollection = System.currentTimeMillis(); + } - public MetadataRecord( - String originalId, - String encoding, - Provenance provenance, - String body, - long dateOfCollection) { + public MetadataRecord( + String originalId, + String encoding, + Provenance provenance, + String body, + long dateOfCollection) { - this.originalId = originalId; - this.encoding = encoding; - this.provenance = provenance; - this.body = body; - this.dateOfCollection = dateOfCollection; - this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix()); - } + this.originalId = originalId; + this.encoding = encoding; + this.provenance = provenance; + this.body = body; + this.dateOfCollection = dateOfCollection; + this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix()); + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getOriginalId() { - return originalId; - } + public String getOriginalId() { + return originalId; + } - public void setOriginalId(String originalId) { - this.originalId = originalId; - } + public void setOriginalId(String originalId) { + this.originalId = originalId; + } - public String getEncoding() { - return encoding; - } + public String getEncoding() { + return encoding; + } - public void setEncoding(String encoding) { - this.encoding = encoding; - } + public void setEncoding(String encoding) { + this.encoding = encoding; + } - public 
Provenance getProvenance() { - return provenance; - } + public Provenance getProvenance() { + return provenance; + } - public void setProvenance(Provenance provenance) { - this.provenance = provenance; - } + public void setProvenance(Provenance provenance) { + this.provenance = provenance; + } - public String getBody() { - return body; - } + public String getBody() { + return body; + } - public void setBody(String body) { - this.body = body; - } + public void setBody(String body) { + this.body = body; + } - public long getDateOfCollection() { - return dateOfCollection; - } + public long getDateOfCollection() { + return dateOfCollection; + } - public void setDateOfCollection(long dateOfCollection) { - this.dateOfCollection = dateOfCollection; - } + public void setDateOfCollection(long dateOfCollection) { + this.dateOfCollection = dateOfCollection; + } - public long getDateOfTransformation() { - return dateOfTransformation; - } + public long getDateOfTransformation() { + return dateOfTransformation; + } - public void setDateOfTransformation(long dateOfTransformation) { - this.dateOfTransformation = dateOfTransformation; - } + public void setDateOfTransformation(long dateOfTransformation) { + this.dateOfTransformation = dateOfTransformation; + } - @Override - public boolean equals(Object o) { - if (!(o instanceof MetadataRecord)) { - return false; - } - return ((MetadataRecord) o).getId().equalsIgnoreCase(id); - } + @Override + public boolean equals(Object o) { + if (!(o instanceof MetadataRecord)) { + return false; + } + return ((MetadataRecord) o).getId().equalsIgnoreCase(id); + } - @Override - public int hashCode() { - return id.hashCode(); - } + @Override + public int hashCode() { + return id.hashCode(); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java index 90897c5c42..556535022e 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java @@ -1,49 +1,52 @@ + package eu.dnetlib.dhp.model.mdstore; import java.io.Serializable; /** * @author Sandro La Bruzzo - *

Provenace class models the provenance of the record in the metadataStore It contains the - * identifier and the name of the datasource that gives the record + *

+ * Provenace class models the provenance of the record in the metadataStore It contains the identifier and the + * name of the datasource that gives the record */ public class Provenance implements Serializable { - private String datasourceId; + private String datasourceId; - private String datasourceName; + private String datasourceName; - private String nsPrefix; + private String nsPrefix; - public Provenance() {} + public Provenance() { + } - public Provenance(String datasourceId, String datasourceName, String nsPrefix) { - this.datasourceId = datasourceId; - this.datasourceName = datasourceName; - this.nsPrefix = nsPrefix; - } + public Provenance(String datasourceId, String datasourceName, String nsPrefix) { + this.datasourceId = datasourceId; + this.datasourceName = datasourceName; + this.nsPrefix = nsPrefix; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(String datasourceId) { + this.datasourceId = datasourceId; + } - public String getDatasourceName() { - return datasourceName; - } + public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(String datasourceName) { + this.datasourceName = datasourceName; + } - public String getNsPrefix() { - return nsPrefix; - } + public String getNsPrefix() { + return nsPrefix; + } - public void setNsPrefix(String nsPrefix) { - this.nsPrefix = nsPrefix; - } + public void setNsPrefix(String nsPrefix) { + this.nsPrefix = nsPrefix; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java index 3576dc92b5..22945309c0 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java @@ -1,12 +1,13 @@ + package eu.dnetlib.dhp.parser.utility; public class VtdException extends Exception { - public VtdException(final Exception e) { - super(e); - } + public VtdException(final Exception e) { + super(e); + } - public VtdException(final Throwable e) { - super(e); - } + public VtdException(final Throwable e) { + super(e); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java index a12662d1f1..9ac0a0bf71 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java @@ -1,105 +1,110 @@ + package eu.dnetlib.dhp.parser.utility; -import com.ximpleware.AutoPilot; -import com.ximpleware.VTDNav; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import com.ximpleware.AutoPilot; +import com.ximpleware.VTDNav; + /** Created by sandro on 9/29/16. 
*/ public class VtdUtilityParser { - public static List getTextValuesWithAttributes( - final AutoPilot ap, final VTDNav vn, final String xpath, final List attributes) - throws VtdException { - final List results = new ArrayList<>(); - try { - ap.selectXPath(xpath); + public static List getTextValuesWithAttributes( + final AutoPilot ap, final VTDNav vn, final String xpath, final List attributes) + throws VtdException { + final List results = new ArrayList<>(); + try { + ap.selectXPath(xpath); - while (ap.evalXPath() != -1) { - final Node currentNode = new Node(); - int t = vn.getText(); - if (t >= 0) { - currentNode.setTextValue(vn.toNormalizedString(t)); - } - currentNode.setAttributes(getAttributes(vn, attributes)); - results.add(currentNode); - } - return results; - } catch (Exception e) { - throw new VtdException(e); - } - } + while (ap.evalXPath() != -1) { + final Node currentNode = new Node(); + int t = vn.getText(); + if (t >= 0) { + currentNode.setTextValue(vn.toNormalizedString(t)); + } + currentNode.setAttributes(getAttributes(vn, attributes)); + results.add(currentNode); + } + return results; + } catch (Exception e) { + throw new VtdException(e); + } + } - private static Map getAttributes(final VTDNav vn, final List attributes) { - final Map currentAttributes = new HashMap<>(); - if (attributes != null) { + private static Map getAttributes(final VTDNav vn, final List attributes) { + final Map currentAttributes = new HashMap<>(); + if (attributes != null) { - attributes.forEach( - attributeKey -> { - try { - int attr = vn.getAttrVal(attributeKey); - if (attr > -1) { - currentAttributes.put(attributeKey, vn.toNormalizedString(attr)); - } - } catch (Throwable e) { - throw new RuntimeException(e); - } - }); - } - return currentAttributes; - } + attributes + .forEach( + attributeKey -> { + try { + int attr = vn.getAttrVal(attributeKey); + if (attr > -1) { + currentAttributes.put(attributeKey, vn.toNormalizedString(attr)); + } + } catch (Throwable e) { + throw 
new RuntimeException(e); + } + }); + } + return currentAttributes; + } - public static List getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) - throws VtdException { - List results = new ArrayList<>(); - try { - ap.selectXPath(xpath); - while (ap.evalXPath() != -1) { - int t = vn.getText(); - if (t > -1) results.add(vn.toNormalizedString(t)); - } - return results; - } catch (Exception e) { - throw new VtdException(e); - } - } + public static List getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) + throws VtdException { + List results = new ArrayList<>(); + try { + ap.selectXPath(xpath); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t > -1) + results.add(vn.toNormalizedString(t)); + } + return results; + } catch (Exception e) { + throw new VtdException(e); + } + } - public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) - throws VtdException { - try { - ap.selectXPath(xpath); - while (ap.evalXPath() != -1) { - int it = nav.getText(); - if (it > -1) return nav.toNormalizedString(it); - } - return null; - } catch (Exception e) { - throw new VtdException(e); - } - } + public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) + throws VtdException { + try { + ap.selectXPath(xpath); + while (ap.evalXPath() != -1) { + int it = nav.getText(); + if (it > -1) + return nav.toNormalizedString(it); + } + return null; + } catch (Exception e) { + throw new VtdException(e); + } + } - public static class Node { + public static class Node { - private String textValue; + private String textValue; - private Map attributes; + private Map attributes; - public String getTextValue() { - return textValue; - } + public String getTextValue() { + return textValue; + } - public void setTextValue(final String textValue) { - this.textValue = textValue; - } + public void setTextValue(final String textValue) { + this.textValue = textValue; + } - public Map 
getAttributes() { - return attributes; - } + public Map getAttributes() { + return attributes; + } - public void setAttributes(final Map attributes) { - this.attributes = attributes; - } - } + public void setAttributes(final Map attributes) { + this.attributes = attributes; + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java index f5800cdaf4..18e489a21c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java @@ -1,70 +1,75 @@ + package eu.dnetlib.dhp.utils; -import com.jayway.jsonpath.JsonPath; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -import net.minidev.json.JSONArray; + import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Base64OutputStream; import org.apache.commons.codec.binary.Hex; +import com.jayway.jsonpath.JsonPath; + +import net.minidev.json.JSONArray; + public class DHPUtils { - public static String md5(final String s) { - try { - final MessageDigest md = MessageDigest.getInstance("MD5"); - md.update(s.getBytes("UTF-8")); - return new String(Hex.encodeHex(md.digest())); - } catch (final Exception e) { - System.err.println("Error creating id"); - return null; - } - } + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes("UTF-8")); + return new String(Hex.encodeHex(md.digest())); + } catch (final Exception e) { + System.err.println("Error creating id"); + return null; + } + } - public static String generateIdentifier(final String originalId, final String nsPrefix) { - return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId)); - } + public static String generateIdentifier(final 
String originalId, final String nsPrefix) { + return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId)); + } - public static String compressString(final String input) { - try (ByteArrayOutputStream out = new ByteArrayOutputStream(); - Base64OutputStream b64os = new Base64OutputStream(out)) { - GZIPOutputStream gzip = new GZIPOutputStream(b64os); - gzip.write(input.getBytes(StandardCharsets.UTF_8)); - gzip.close(); - return out.toString(); - } catch (Throwable e) { - return null; - } - } + public static String compressString(final String input) { + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + Base64OutputStream b64os = new Base64OutputStream(out)) { + GZIPOutputStream gzip = new GZIPOutputStream(b64os); + gzip.write(input.getBytes(StandardCharsets.UTF_8)); + gzip.close(); + return out.toString(); + } catch (Throwable e) { + return null; + } + } - public static String decompressString(final String input) { - byte[] byteArray = Base64.decodeBase64(input.getBytes()); - int len; - try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray))); - ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) { - byte[] buffer = new byte[1024]; - while ((len = gis.read(buffer)) != -1) { - bos.write(buffer, 0, len); - } - return bos.toString(); - } catch (Exception e) { - return null; - } - } + public static String decompressString(final String input) { + byte[] byteArray = Base64.decodeBase64(input.getBytes()); + int len; + try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray))); + ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) { + byte[] buffer = new byte[1024]; + while ((len = gis.read(buffer)) != -1) { + bos.write(buffer, 0, len); + } + return bos.toString(); + } catch (Exception e) { + return null; + } + } - public static String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o 
instanceof String) return (String) o; - if (o instanceof JSONArray && ((JSONArray) o).size() > 0) - return (String) ((JSONArray) o).get(0); - return o.toString(); - } catch (Exception e) { - return ""; - } - } + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return o.toString(); + } catch (Exception e) { + return ""; + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java index b6f3f111ac..97fe4b9d83 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java @@ -1,24 +1,26 @@ + package eu.dnetlib.dhp.utils; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.cxf.jaxws.JaxWsProxyFactoryBean; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + public class ISLookupClientFactory { - private static final Log log = LogFactory.getLog(ISLookupClientFactory.class); + private static final Log log = LogFactory.getLog(ISLookupClientFactory.class); - public static ISLookUpService getLookUpService(final String isLookupUrl) { - return getServiceStub(ISLookUpService.class, isLookupUrl); - } + public static ISLookUpService getLookUpService(final String isLookupUrl) { + return getServiceStub(ISLookUpService.class, isLookupUrl); + } - @SuppressWarnings("unchecked") - private static T getServiceStub(final Class clazz, final String endpoint) { - log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint)); - final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean(); - 
jaxWsProxyFactory.setServiceClass(clazz); - jaxWsProxyFactory.setAddress(endpoint); - return (T) jaxWsProxyFactory.create(); - } + @SuppressWarnings("unchecked") + private static T getServiceStub(final Class clazz, final String endpoint) { + log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint)); + final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean(); + jaxWsProxyFactory.setServiceClass(clazz); + jaxWsProxyFactory.setAddress(endpoint); + return (T) jaxWsProxyFactory.create(); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java index 57bd130cba..9b00b908c1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.utils.saxon; import net.sf.saxon.expr.XPathContext; @@ -9,25 +10,24 @@ import net.sf.saxon.trans.XPathException; public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition { - public static String DEFAULT_SAXON_EXT_NS_URI = - "http://www.d-net.research-infrastructures.eu/saxon-extension"; + public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension"; - public abstract String getName(); + public abstract String getName(); - public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException; + public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException; - @Override - public StructuredQName getFunctionQName() { - return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName()); - } + @Override + public StructuredQName getFunctionQName() { + return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName()); + } - @Override - public ExtensionFunctionCall 
makeCallExpression() { - return new ExtensionFunctionCall() { - @Override - public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException { - return doCall(context, arguments); - } - }; - } + @Override + public ExtensionFunctionCall makeCallExpression() { + return new ExtensionFunctionCall() { + @Override + public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException { + return doCall(context, arguments); + } + }; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java index 38ecb6377f..c7e311b02a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java @@ -1,9 +1,11 @@ + package eu.dnetlib.dhp.utils.saxon; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.GregorianCalendar; + import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Item; import net.sf.saxon.om.Sequence; @@ -13,55 +15,59 @@ import net.sf.saxon.value.StringValue; public class ExtractYear extends AbstractExtensionFunction { - private static final String[] dateFormats = {"yyyy-MM-dd", "yyyy/MM/dd"}; + private static final String[] dateFormats = { + "yyyy-MM-dd", "yyyy/MM/dd" + }; - @Override - public String getName() { - return "extractYear"; - } + @Override + public String getName() { + return "extractYear"; + } - @Override - public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { - if (arguments == null | arguments.length == 0) { - return new StringValue(""); - } - final Item item = arguments[0].head(); - if (item == null) { - return new StringValue(""); - } - return new StringValue(_year(item.getStringValue())); - } + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | 
arguments.length == 0) { + return new StringValue(""); + } + final Item item = arguments[0].head(); + if (item == null) { + return new StringValue(""); + } + return new StringValue(_year(item.getStringValue())); + } - @Override - public int getMinimumNumberOfArguments() { - return 0; - } + @Override + public int getMinimumNumberOfArguments() { + return 0; + } - @Override - public int getMaximumNumberOfArguments() { - return 1; - } + @Override + public int getMaximumNumberOfArguments() { + return 1; + } - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] {SequenceType.OPTIONAL_ITEM}; - } + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.OPTIONAL_ITEM + }; + } - @Override - public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { - return SequenceType.SINGLE_STRING; - } + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } - private String _year(String s) { - Calendar c = new GregorianCalendar(); - for (String format : dateFormats) { - try { - c.setTime(new SimpleDateFormat(format).parse(s)); - String year = String.valueOf(c.get(Calendar.YEAR)); - return year; - } catch (ParseException e) { - } - } - return ""; - } + private String _year(String s) { + Calendar c = new GregorianCalendar(); + for (String format : dateFormats) { + try { + c.setTime(new SimpleDateFormat(format).parse(s)); + String year = String.valueOf(c.get(Calendar.YEAR)); + return year; + } catch (ParseException e) { + } + } + return ""; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java index def4fdfc7b..4a719909a5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java @@ -1,8 +1,10 @@ + package 
eu.dnetlib.dhp.utils.saxon; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; + import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Sequence; import net.sf.saxon.trans.XPathException; @@ -11,57 +13,59 @@ import net.sf.saxon.value.StringValue; public class NormalizeDate extends AbstractExtensionFunction { - private static final String[] normalizeDateFormats = { - "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" - }; + private static final String[] normalizeDateFormats = { + "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" + }; - private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); + private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); - @Override - public String getName() { - return "normalizeDate"; - } + @Override + public String getName() { + return "normalizeDate"; + } - @Override - public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { - if (arguments == null | arguments.length == 0) { - return new StringValue(""); - } - String s = arguments[0].head().getStringValue(); - return new StringValue(_year(s)); - } + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } + String s = arguments[0].head().getStringValue(); + return new StringValue(_year(s)); + } - @Override - public int getMinimumNumberOfArguments() { - return 0; - } + @Override + public int getMinimumNumberOfArguments() { + return 0; + } - @Override - public int getMaximumNumberOfArguments() { - return 1; - } + @Override + public int getMaximumNumberOfArguments() { + return 1; + } - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] {SequenceType.OPTIONAL_ITEM}; - } + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + 
SequenceType.OPTIONAL_ITEM + }; + } - @Override - public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { - return SequenceType.SINGLE_STRING; - } + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } - private String _year(String s) { - final String date = s != null ? s.trim() : ""; + private String _year(String s) { + final String date = s != null ? s.trim() : ""; - for (String format : normalizeDateFormats) { - try { - Date parse = new SimpleDateFormat(format).parse(date); - String res = new SimpleDateFormat(normalizeOutFormat).format(parse); - return res; - } catch (ParseException e) { - } - } - return ""; - } + for (String format : normalizeDateFormats) { + try { + Date parse = new SimpleDateFormat(format).parse(date); + String res = new SimpleDateFormat(normalizeOutFormat).format(parse); + return res; + } catch (ParseException e) { + } + } + return ""; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java index 73159c6179..46ecafd0aa 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java @@ -1,59 +1,63 @@ + package eu.dnetlib.dhp.utils.saxon; +import org.apache.commons.lang3.StringUtils; + import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Item; import net.sf.saxon.om.Sequence; import net.sf.saxon.trans.XPathException; import net.sf.saxon.value.SequenceType; import net.sf.saxon.value.StringValue; -import org.apache.commons.lang3.StringUtils; public class PickFirst extends AbstractExtensionFunction { - @Override - public String getName() { - return "pickFirst"; - } + @Override + public String getName() { + return "pickFirst"; + } - @Override - public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { - if (arguments == null | 
arguments.length == 0) { - return new StringValue(""); - } + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } - final String s1 = getValue(arguments[0]); - final String s2 = getValue(arguments[1]); + final String s1 = getValue(arguments[0]); + final String s2 = getValue(arguments[1]); - return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); - } + return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); + } - private String getValue(final Sequence arg) throws XPathException { - if (arg != null) { - final Item item = arg.head(); - if (item != null) { - return item.getStringValue(); - } - } - return ""; - } + private String getValue(final Sequence arg) throws XPathException { + if (arg != null) { + final Item item = arg.head(); + if (item != null) { + return item.getStringValue(); + } + } + return ""; + } - @Override - public int getMinimumNumberOfArguments() { - return 0; - } + @Override + public int getMinimumNumberOfArguments() { + return 0; + } - @Override - public int getMaximumNumberOfArguments() { - return 2; - } + @Override + public int getMaximumNumberOfArguments() { + return 2; + } - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] {SequenceType.OPTIONAL_ITEM}; - } + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.OPTIONAL_ITEM + }; + } - @Override - public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { - return SequenceType.SINGLE_STRING; - } + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java 
b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java index 18ce51887b..b85d866f11 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java @@ -1,29 +1,32 @@ + package eu.dnetlib.dhp.utils.saxon; import java.io.StringReader; + import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.stream.StreamSource; + import net.sf.saxon.Configuration; import net.sf.saxon.TransformerFactoryImpl; public class SaxonTransformerFactory { - /** - * Creates the index record transformer from the given XSLT - * - * @param xslt - * @return - * @throws TransformerException - */ - public static Transformer newInstance(final String xslt) throws TransformerException { + /** + * Creates the index record transformer from the given XSLT + * + * @param xslt + * @return + * @throws TransformerException + */ + public static Transformer newInstance(final String xslt) throws TransformerException { - final TransformerFactoryImpl factory = new TransformerFactoryImpl(); - final Configuration conf = factory.getConfiguration(); - conf.registerExtensionFunction(new ExtractYear()); - conf.registerExtensionFunction(new NormalizeDate()); - conf.registerExtensionFunction(new PickFirst()); + final TransformerFactoryImpl factory = new TransformerFactoryImpl(); + final Configuration conf = factory.getConfiguration(); + conf.registerExtensionFunction(new ExtractYear()); + conf.registerExtensionFunction(new NormalizeDate()); + conf.registerExtensionFunction(new PickFirst()); - return factory.newTransformer(new StreamSource(new StringReader(xslt))); - } + return factory.newTransformer(new StreamSource(new StringReader(xslt))); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/message/Message.java index b62afb19a0..fc1c382910 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/Message.java @@ -1,73 +1,76 @@ + package eu.dnetlib.message; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.util.Map; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + public class Message { - private String workflowId; + private String workflowId; - private String jobName; + private String jobName; - private MessageType type; + private MessageType type; - private Map body; + private Map body; - public static Message fromJson(final String json) throws IOException { - final ObjectMapper jsonMapper = new ObjectMapper(); - return jsonMapper.readValue(json, Message.class); - } + public static Message fromJson(final String json) throws IOException { + final ObjectMapper jsonMapper = new ObjectMapper(); + return jsonMapper.readValue(json, Message.class); + } - public Message() {} + public Message() { + } - public Message(String workflowId, String jobName, MessageType type, Map body) { - this.workflowId = workflowId; - this.jobName = jobName; - this.type = type; - this.body = body; - } + public Message(String workflowId, String jobName, MessageType type, Map body) { + this.workflowId = workflowId; + this.jobName = jobName; + this.type = type; + this.body = body; + } - public String getWorkflowId() { - return workflowId; - } + public String getWorkflowId() { + return workflowId; + } - public void setWorkflowId(String workflowId) { - this.workflowId = workflowId; - } + public void setWorkflowId(String workflowId) { + this.workflowId = workflowId; + } - public String getJobName() { - return jobName; - } + public String getJobName() { + return jobName; + } - public void setJobName(String jobName) { - this.jobName = jobName; - } + public void setJobName(String jobName) { + this.jobName = jobName; + } - 
public MessageType getType() { - return type; - } + public MessageType getType() { + return type; + } - public void setType(MessageType type) { - this.type = type; - } + public void setType(MessageType type) { + this.type = type; + } - public Map getBody() { - return body; - } + public Map getBody() { + return body; + } - public void setBody(Map body) { - this.body = body; - } + public void setBody(Map body) { + this.body = body; + } - @Override - public String toString() { - final ObjectMapper jsonMapper = new ObjectMapper(); - try { - return jsonMapper.writeValueAsString(this); - } catch (JsonProcessingException e) { - return null; - } - } + @Override + public String toString() { + final ObjectMapper jsonMapper = new ObjectMapper(); + try { + return jsonMapper.writeValueAsString(this); + } catch (JsonProcessingException e) { + return null; + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java index 3df712a624..fb3f0bd956 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java @@ -1,45 +1,47 @@ + package eu.dnetlib.message; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.LinkedBlockingQueue; + import com.rabbitmq.client.AMQP; import com.rabbitmq.client.Channel; import com.rabbitmq.client.DefaultConsumer; import com.rabbitmq.client.Envelope; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.LinkedBlockingQueue; public class MessageConsumer extends DefaultConsumer { - final LinkedBlockingQueue queueMessages; + final LinkedBlockingQueue queueMessages; - /** - * Constructs a new instance and records its association to the passed-in channel. 
- * - * @param channel the channel to which this consumer is attached - * @param queueMessages - */ - public MessageConsumer(Channel channel, LinkedBlockingQueue queueMessages) { - super(channel); - this.queueMessages = queueMessages; - } + /** + * Constructs a new instance and records its association to the passed-in channel. + * + * @param channel the channel to which this consumer is attached + * @param queueMessages + */ + public MessageConsumer(Channel channel, LinkedBlockingQueue queueMessages) { + super(channel); + this.queueMessages = queueMessages; + } - @Override - public void handleDelivery( - String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) - throws IOException { - final String json = new String(body, StandardCharsets.UTF_8); - Message message = Message.fromJson(json); - try { - this.queueMessages.put(message); - System.out.println("Receiving Message " + message); - } catch (InterruptedException e) { - if (message.getType() == MessageType.REPORT) - throw new RuntimeException("Error on sending message"); - else { - // TODO LOGGING EXCEPTION - } - } finally { - getChannel().basicAck(envelope.getDeliveryTag(), false); - } - } + @Override + public void handleDelivery( + String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) + throws IOException { + final String json = new String(body, StandardCharsets.UTF_8); + Message message = Message.fromJson(json); + try { + this.queueMessages.put(message); + System.out.println("Receiving Message " + message); + } catch (InterruptedException e) { + if (message.getType() == MessageType.REPORT) + throw new RuntimeException("Error on sending message"); + else { + // TODO LOGGING EXCEPTION + } + } finally { + getChannel().basicAck(envelope.getDeliveryTag(), false); + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java index 8370a6cc8e..4c5c48c559 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java @@ -1,134 +1,136 @@ + package eu.dnetlib.message; -import com.rabbitmq.client.Channel; -import com.rabbitmq.client.Connection; -import com.rabbitmq.client.ConnectionFactory; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeoutException; +import com.rabbitmq.client.Channel; +import com.rabbitmq.client.Connection; +import com.rabbitmq.client.ConnectionFactory; + public class MessageManager { - private final String messageHost; + private final String messageHost; - private final String username; + private final String username; - private final String password; + private final String password; - private Connection connection; + private Connection connection; - private Map channels = new HashMap<>(); + private Map channels = new HashMap<>(); - private boolean durable; + private boolean durable; - private boolean autodelete; + private boolean autodelete; - private final LinkedBlockingQueue queueMessages; + private final LinkedBlockingQueue queueMessages; - public MessageManager( - String messageHost, - String username, - String password, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = username; - this.password = password; - } + public MessageManager( + String messageHost, + String username, + String password, + final LinkedBlockingQueue queueMessages) { + this.queueMessages = queueMessages; + this.messageHost = messageHost; + this.username = username; + this.password = password; + } - public MessageManager( - String messageHost, - String username, - String password, - boolean durable, - boolean autodelete, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = 
username; - this.password = password; + public MessageManager( + String messageHost, + String username, + String password, + boolean durable, + boolean autodelete, + final LinkedBlockingQueue queueMessages) { + this.queueMessages = queueMessages; + this.messageHost = messageHost; + this.username = username; + this.password = password; - this.durable = durable; - this.autodelete = autodelete; - } + this.durable = durable; + this.autodelete = autodelete; + } - private Connection createConnection() throws IOException, TimeoutException { - ConnectionFactory factory = new ConnectionFactory(); - factory.setHost(this.messageHost); - factory.setUsername(this.username); - factory.setPassword(this.password); - return factory.newConnection(); - } + private Connection createConnection() throws IOException, TimeoutException { + ConnectionFactory factory = new ConnectionFactory(); + factory.setHost(this.messageHost); + factory.setUsername(this.username); + factory.setPassword(this.password); + return factory.newConnection(); + } - private Channel createChannel( - final Connection connection, - final String queueName, - final boolean durable, - final boolean autodelete) - throws Exception { - Map args = new HashMap<>(); - args.put("x-message-ttl", 10000); - Channel channel = connection.createChannel(); - channel.queueDeclare(queueName, durable, false, this.autodelete, args); - return channel; - } + private Channel createChannel( + final Connection connection, + final String queueName, + final boolean durable, + final boolean autodelete) + throws Exception { + Map args = new HashMap<>(); + args.put("x-message-ttl", 10000); + Channel channel = connection.createChannel(); + channel.queueDeclare(queueName, durable, false, this.autodelete, args); + return channel; + } - private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete) - throws Exception { - if (channels.containsKey(queueName)) { - return channels.get(queueName); - } + private Channel 
getOrCreateChannel(final String queueName, boolean durable, boolean autodelete) + throws Exception { + if (channels.containsKey(queueName)) { + return channels.get(queueName); + } - if (this.connection == null) { - this.connection = createConnection(); - } - channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete)); - return channels.get(queueName); - } + if (this.connection == null) { + this.connection = createConnection(); + } + channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete)); + return channels.get(queueName); + } - public void close() throws IOException { - channels - .values() - .forEach( - ch -> { - try { - ch.close(); - } catch (Exception e) { - // TODO LOG - } - }); + public void close() throws IOException { + channels + .values() + .forEach( + ch -> { + try { + ch.close(); + } catch (Exception e) { + // TODO LOG + } + }); - this.connection.close(); - } + this.connection.close(); + } - public boolean sendMessage(final Message message, String queueName) throws Exception { - try { - Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } + public boolean sendMessage(final Message message, String queueName) throws Exception { + try { + Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete); + channel.basicPublish("", queueName, null, message.toString().getBytes()); + return true; + } catch (Throwable e) { + throw new RuntimeException(e); + } + } - public boolean sendMessage( - final Message message, String queueName, boolean durable_var, boolean autodelete_var) - throws Exception { - try { - Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw 
new RuntimeException(e); - } - } + public boolean sendMessage( + final Message message, String queueName, boolean durable_var, boolean autodelete_var) + throws Exception { + try { + Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var); + channel.basicPublish("", queueName, null, message.toString().getBytes()); + return true; + } catch (Throwable e) { + throw new RuntimeException(e); + } + } - public void startConsumingMessage( - final String queueName, final boolean durable, final boolean autodelete) throws Exception { + public void startConsumingMessage( + final String queueName, final boolean durable, final boolean autodelete) throws Exception { - Channel channel = createChannel(createConnection(), queueName, durable, autodelete); - channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages)); - } + Channel channel = createChannel(createConnection(), queueName, durable, autodelete); + channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages)); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java index edca900611..72cbda2528 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java @@ -1,6 +1,6 @@ + package eu.dnetlib.message; public enum MessageType { - ONGOING, - REPORT + ONGOING, REPORT } diff --git a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java index 1ae6e8eadc..e07fcef66e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java +++ b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java @@ -1,24 +1,25 @@ + package eu.dnetlib.scholexplorer.relation; import java.io.Serializable; public class RelInfo implements Serializable { - private String original; - private String inverse; 
+ private String original; + private String inverse; - public String getOriginal() { - return original; - } + public String getOriginal() { + return original; + } - public void setOriginal(String original) { - this.original = original; - } + public void setOriginal(String original) { + this.original = original; + } - public String getInverse() { - return inverse; - } + public String getInverse() { + return inverse; + } - public void setInverse(String inverse) { - this.inverse = inverse; - } + public void setInverse(String inverse) { + this.inverse = inverse; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java index 9cc995821f..eb708c390c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java +++ b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java @@ -1,18 +1,20 @@ + package eu.dnetlib.scholexplorer.relation; -import com.fasterxml.jackson.databind.ObjectMapper; import java.io.Serializable; import java.util.HashMap; + import org.apache.commons.io.IOUtils; +import com.fasterxml.jackson.databind.ObjectMapper; + public class RelationMapper extends HashMap implements Serializable { - public static RelationMapper load() throws Exception { + public static RelationMapper load() throws Exception { - final String json = - IOUtils.toString(RelationMapper.class.getResourceAsStream("relations.json")); + final String json = IOUtils.toString(RelationMapper.class.getResourceAsStream("relations.json")); - ObjectMapper mapper = new ObjectMapper(); - return mapper.readValue(json, RelationMapper.class); - } + ObjectMapper mapper = new ObjectMapper(); + return mapper.readValue(json, RelationMapper.class); + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java 
index bb7351745d..e140208308 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.application; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -8,58 +9,59 @@ import org.junit.jupiter.api.Test; public class ArgumentApplicationParserTest { - @Test - public void testParseParameter() throws Exception { - final String jsonConfiguration = - IOUtils.toString( - this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json")); - assertNotNull(jsonConfiguration); - ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument( - new String[] { - "-p", - "value0", - "-a", - "value1", - "-n", - "value2", - "-u", - "value3", - "-ru", - "value4", - "-rp", - "value5", - "-rh", - "value6", - "-ro", - "value7", - "-rr", - "value8", - "-w", - "value9", - "-cc", - ArgumentApplicationParser.compressArgument(jsonConfiguration) - }); - assertNotNull(parser.get("hdfsPath")); - assertNotNull(parser.get("apidescriptor")); - assertNotNull(parser.get("namenode")); - assertNotNull(parser.get("userHDFS")); - assertNotNull(parser.get("rabbitUser")); - assertNotNull(parser.get("rabbitPassWord")); - assertNotNull(parser.get("rabbitHost")); - assertNotNull(parser.get("rabbitOngoingQueue")); - assertNotNull(parser.get("rabbitReportQueue")); - assertNotNull(parser.get("workflowId")); - assertEquals("value0", parser.get("hdfsPath")); - assertEquals("value1", parser.get("apidescriptor")); - assertEquals("value2", parser.get("namenode")); - assertEquals("value3", parser.get("userHDFS")); - assertEquals("value4", parser.get("rabbitUser")); - assertEquals("value5", parser.get("rabbitPassWord")); - assertEquals("value6", parser.get("rabbitHost")); - assertEquals("value7", parser.get("rabbitOngoingQueue")); - assertEquals("value8", 
parser.get("rabbitReportQueue")); - assertEquals("value9", parser.get("workflowId")); - assertEquals(jsonConfiguration, parser.get("ccCoco")); - } + @Test + public void testParseParameter() throws Exception { + final String jsonConfiguration = IOUtils + .toString( + this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json")); + assertNotNull(jsonConfiguration); + ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser + .parseArgument( + new String[] { + "-p", + "value0", + "-a", + "value1", + "-n", + "value2", + "-u", + "value3", + "-ru", + "value4", + "-rp", + "value5", + "-rh", + "value6", + "-ro", + "value7", + "-rr", + "value8", + "-w", + "value9", + "-cc", + ArgumentApplicationParser.compressArgument(jsonConfiguration) + }); + assertNotNull(parser.get("hdfsPath")); + assertNotNull(parser.get("apidescriptor")); + assertNotNull(parser.get("namenode")); + assertNotNull(parser.get("userHDFS")); + assertNotNull(parser.get("rabbitUser")); + assertNotNull(parser.get("rabbitPassWord")); + assertNotNull(parser.get("rabbitHost")); + assertNotNull(parser.get("rabbitOngoingQueue")); + assertNotNull(parser.get("rabbitReportQueue")); + assertNotNull(parser.get("workflowId")); + assertEquals("value0", parser.get("hdfsPath")); + assertEquals("value1", parser.get("apidescriptor")); + assertEquals("value2", parser.get("namenode")); + assertEquals("value3", parser.get("userHDFS")); + assertEquals("value4", parser.get("rabbitUser")); + assertEquals("value5", parser.get("rabbitPassWord")); + assertEquals("value6", parser.get("rabbitHost")); + assertEquals("value7", parser.get("rabbitOngoingQueue")); + assertEquals("value8", parser.get("rabbitReportQueue")); + assertEquals("value9", parser.get("workflowId")); + assertEquals(jsonConfiguration, parser.get("ccCoco")); + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java 
index a8f0bbb0d0..870943816f 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.common; import static org.junit.jupiter.api.Assertions.*; @@ -8,6 +9,7 @@ import java.nio.file.Path; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; + import org.apache.hadoop.conf.Configuration; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -15,63 +17,64 @@ import org.junit.jupiter.api.io.TempDir; public class HdfsSupportTest { - @Nested - class Remove { + @Nested + class Remove { - @Test - public void shouldThrowARuntimeExceptionOnError() { - // when - assertThrows(RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration())); - } + @Test + public void shouldThrowARuntimeExceptionOnError() { + // when + assertThrows(RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration())); + } - @Test - public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) { - // when - HdfsSupport.remove(tempDir.toString(), new Configuration()); + @Test + public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) { + // when + HdfsSupport.remove(tempDir.toString(), new Configuration()); - // then - assertFalse(Files.exists(tempDir)); - } + // then + assertFalse(Files.exists(tempDir)); + } - @Test - public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException { - // given - Path file = Files.createTempFile(tempDir, "p", "s"); + @Test + public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException { + // given + Path file = Files.createTempFile(tempDir, "p", "s"); - // when - HdfsSupport.remove(file.toString(), new Configuration()); + // when + HdfsSupport.remove(file.toString(), new Configuration()); - // then - assertFalse(Files.exists(file)); - } - } + // then + assertFalse(Files.exists(file)); + } + } - @Nested - class ListFiles { 
+ @Nested + class ListFiles { - @Test - public void shouldThrowARuntimeExceptionOnError() { - // when - assertThrows(RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration())); - } + @Test + public void shouldThrowARuntimeExceptionOnError() { + // when + assertThrows(RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration())); + } - @Test - public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException { - Path subDir1 = Files.createTempDirectory(tempDir, "list_me"); - Path subDir2 = Files.createTempDirectory(tempDir, "list_me"); + @Test + public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException { + Path subDir1 = Files.createTempDirectory(tempDir, "list_me"); + Path subDir2 = Files.createTempDirectory(tempDir, "list_me"); - // when - List paths = HdfsSupport.listFiles(tempDir.toString(), new Configuration()); + // when + List paths = HdfsSupport.listFiles(tempDir.toString(), new Configuration()); - // then - assertEquals(2, paths.size()); - List expecteds = - Arrays.stream(new String[] {subDir1.toString(), subDir2.toString()}) - .sorted() - .collect(Collectors.toList()); - List actuals = paths.stream().sorted().collect(Collectors.toList()); - assertTrue(actuals.get(0).contains(expecteds.get(0))); - assertTrue(actuals.get(1).contains(expecteds.get(1))); - } - } + // then + assertEquals(2, paths.size()); + List expecteds = Arrays.stream(new String[] { + subDir1.toString(), subDir2.toString() + }) + .sorted() + .collect(Collectors.toList()); + List actuals = paths.stream().sorted().collect(Collectors.toList()); + assertTrue(actuals.get(0).contains(expecteds.get(0))); + assertTrue(actuals.get(1).contains(expecteds.get(1))); + } + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java index 698b9cea59..2f01c08631 100644 --- 
a/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java @@ -1,55 +1,58 @@ + package eu.dnetlib.dhp.common; import static org.mockito.Mockito.*; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; import java.util.function.Function; + import org.apache.spark.SparkConf; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; + public class SparkSessionSupportTest { - @Nested - class RunWithSparkSession { + @Nested + class RunWithSparkSession { - @Test - public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() - throws Exception { - // given - SparkSession spark = mock(SparkSession.class); - SparkConf conf = mock(SparkConf.class); - Function sparkSessionBuilder = mock(Function.class); - when(sparkSessionBuilder.apply(conf)).thenReturn(spark); - ThrowingConsumer fn = mock(ThrowingConsumer.class); + @Test + public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() + throws Exception { + // given + SparkSession spark = mock(SparkSession.class); + SparkConf conf = mock(SparkConf.class); + Function sparkSessionBuilder = mock(Function.class); + when(sparkSessionBuilder.apply(conf)).thenReturn(spark); + ThrowingConsumer fn = mock(ThrowingConsumer.class); - // when - SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, false, fn); + // when + SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, false, fn); - // then - verify(sparkSessionBuilder).apply(conf); - verify(fn).accept(spark); - verify(spark, never()).stop(); - } + // then + verify(sparkSessionBuilder).apply(conf); + verify(fn).accept(spark); + verify(spark, never()).stop(); + } - @Test - public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() - throws 
Exception { - // given - SparkSession spark = mock(SparkSession.class); - SparkConf conf = mock(SparkConf.class); - Function sparkSessionBuilder = mock(Function.class); - when(sparkSessionBuilder.apply(conf)).thenReturn(spark); - ThrowingConsumer fn = mock(ThrowingConsumer.class); + @Test + public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() + throws Exception { + // given + SparkSession spark = mock(SparkSession.class); + SparkConf conf = mock(SparkConf.class); + Function sparkSessionBuilder = mock(Function.class); + when(sparkSessionBuilder.apply(conf)).thenReturn(spark); + ThrowingConsumer fn = mock(ThrowingConsumer.class); - // when - SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, true, fn); + // when + SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, true, fn); - // then - verify(sparkSessionBuilder).apply(conf); - verify(fn).accept(spark); - verify(spark, times(1)).stop(); - } - } + // then + verify(sparkSessionBuilder).apply(conf); + verify(fn).accept(spark); + verify(spark, times(1)).stop(); + } + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java index 84cb08d953..cb4d0ab500 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.model.mdstore; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -6,10 +7,10 @@ import org.junit.jupiter.api.Test; public class MetadataRecordTest { - @Test - public void getTimestamp() { + @Test + public void getTimestamp() { - MetadataRecord r = new MetadataRecord(); - assertTrue(r.getDateOfCollection() > 0); - } + MetadataRecord r = new MetadataRecord(); + assertTrue(r.getDateOfCollection() > 0); + } } diff --git 
a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java index a514f8573d..442f7b5c25 100644 --- a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.message; import static org.junit.jupiter.api.Assertions.*; @@ -5,46 +6,46 @@ import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.util.HashMap; import java.util.Map; + import org.junit.jupiter.api.Test; public class MessageTest { - @Test - public void fromJsonTest() throws IOException { - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); + @Test + public void fromJsonTest() throws IOException { + Message m = new Message(); + m.setWorkflowId("wId"); + m.setType(MessageType.ONGOING); + m.setJobName("Collection"); + Map body = new HashMap<>(); + body.put("parsedItem", "300"); + body.put("ExecutionTime", "30s"); - m.setBody(body); - System.out.println("m = " + m); - Message m1 = Message.fromJson(m.toString()); - assertEquals(m1.getWorkflowId(), m.getWorkflowId()); - assertEquals(m1.getType(), m.getType()); - assertEquals(m1.getJobName(), m.getJobName()); + m.setBody(body); + System.out.println("m = " + m); + Message m1 = Message.fromJson(m.toString()); + assertEquals(m1.getWorkflowId(), m.getWorkflowId()); + assertEquals(m1.getType(), m.getType()); + assertEquals(m1.getJobName(), m.getJobName()); - assertNotNull(m1.getBody()); - m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it))); - assertEquals(m1.getJobName(), m.getJobName()); - } + assertNotNull(m1.getBody()); + m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it))); + assertEquals(m1.getJobName(), 
m.getJobName()); + } - @Test - public void toStringTest() { - final String expectedJson = - "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}"; - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); + @Test + public void toStringTest() { + final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}"; + Message m = new Message(); + m.setWorkflowId("wId"); + m.setType(MessageType.ONGOING); + m.setJobName("Collection"); + Map body = new HashMap<>(); + body.put("parsedItem", "300"); + body.put("ExecutionTime", "30s"); - m.setBody(body); + m.setBody(body); - assertEquals(expectedJson, m.toString()); - } + assertEquals(expectedJson, m.toString()); + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java index 9381cb01f8..d1d1ada71a 100644 --- a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java @@ -1,13 +1,14 @@ + package eu.dnetlib.scholexplorer.relation; import org.junit.jupiter.api.Test; public class RelationMapperTest { - @Test - public void testLoadRels() throws Exception { + @Test + public void testLoadRels() throws Exception { - RelationMapper relationMapper = RelationMapper.load(); - relationMapper.keySet().forEach(System.out::println); - } + RelationMapper relationMapper = RelationMapper.load(); + relationMapper.keySet().forEach(System.out::println); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java 
b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java index c803fab521..84b22c81c0 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java @@ -1,36 +1,40 @@ + package eu.dnetlib.dhp.schema.action; -import com.fasterxml.jackson.databind.annotation.JsonDeserialize; -import eu.dnetlib.dhp.schema.oaf.Oaf; import java.io.Serializable; +import com.fasterxml.jackson.databind.annotation.JsonDeserialize; + +import eu.dnetlib.dhp.schema.oaf.Oaf; + @JsonDeserialize(using = AtomicActionDeserializer.class) public class AtomicAction implements Serializable { - private Class clazz; + private Class clazz; - private T payload; + private T payload; - public AtomicAction() {} + public AtomicAction() { + } - public AtomicAction(Class clazz, T payload) { - this.clazz = clazz; - this.payload = payload; - } + public AtomicAction(Class clazz, T payload) { + this.clazz = clazz; + this.payload = payload; + } - public Class getClazz() { - return clazz; - } + public Class getClazz() { + return clazz; + } - public void setClazz(Class clazz) { - this.clazz = clazz; - } + public void setClazz(Class clazz) { + this.clazz = clazz; + } - public T getPayload() { - return payload; - } + public T getPayload() { + return payload; + } - public void setPayload(T payload) { - this.payload = payload; - } + public void setPayload(T payload) { + this.payload = payload; + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java index 701833c42a..a9543d27a1 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java @@ -1,29 +1,32 @@ + package eu.dnetlib.dhp.schema.action; +import java.io.IOException; + import 
com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.schema.oaf.Oaf; -import java.io.IOException; public class AtomicActionDeserializer extends JsonDeserializer { - @Override - public Object deserialize(JsonParser jp, DeserializationContext ctxt) - throws IOException, JsonProcessingException { - JsonNode node = jp.getCodec().readTree(jp); - String classTag = node.get("clazz").asText(); - JsonNode payload = node.get("payload"); - ObjectMapper mapper = new ObjectMapper(); + @Override + public Object deserialize(JsonParser jp, DeserializationContext ctxt) + throws IOException, JsonProcessingException { + JsonNode node = jp.getCodec().readTree(jp); + String classTag = node.get("clazz").asText(); + JsonNode payload = node.get("payload"); + ObjectMapper mapper = new ObjectMapper(); - try { - final Class clazz = Class.forName(classTag); - return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz)); - } catch (ClassNotFoundException e) { - throw new IOException(e); - } - } + try { + final Class clazz = Class.forName(classTag); + return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz)); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java index d597ecb532..54f30cf336 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java @@ -1,26 +1,21 @@ + package eu.dnetlib.dhp.schema.common; import eu.dnetlib.dhp.schema.oaf.OafEntity; /** Actual entity types in the Graph */ 
public enum EntityType { - publication, - dataset, - otherresearchproduct, - software, - datasource, - organization, - project; + publication, dataset, otherresearchproduct, software, datasource, organization, project; - /** - * Resolves the EntityType, given the relative class name - * - * @param clazz the given class name - * @param actual OafEntity subclass - * @return the EntityType associated to the given class - */ - public static EntityType fromClass(Class clazz) { + /** + * Resolves the EntityType, given the relative class name + * + * @param clazz the given class name + * @param actual OafEntity subclass + * @return the EntityType associated to the given class + */ + public static EntityType fromClass(Class clazz) { - return EntityType.valueOf(clazz.getSimpleName().toLowerCase()); - } + return EntityType.valueOf(clazz.getSimpleName().toLowerCase()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java index 466cdc9e96..cda8ba4842 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java @@ -1,9 +1,7 @@ + package eu.dnetlib.dhp.schema.common; /** Main entity types in the Graph */ public enum MainEntityType { - result, - datasource, - organization, - project + result, datasource, organization, project } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index 0dfdaad528..c6bfff12d4 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -1,40 +1,41 @@ + package eu.dnetlib.dhp.schema.common; import eu.dnetlib.dhp.schema.oaf.Qualifier; public class ModelConstants { - public static final String 
DNET_RESULT_TYPOLOGIES = "dnet:result_typologies"; + public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies"; - public static final String DATASET_RESULTTYPE_CLASSID = "dataset"; - public static final String PUBLICATION_RESULTTYPE_CLASSID = "publication"; - public static final String SOFTWARE_RESULTTYPE_CLASSID = "software"; - public static final String ORP_RESULTTYPE_CLASSID = "other"; + public static final String DATASET_RESULTTYPE_CLASSID = "dataset"; + public static final String PUBLICATION_RESULTTYPE_CLASSID = "publication"; + public static final String SOFTWARE_RESULTTYPE_CLASSID = "software"; + public static final String ORP_RESULTTYPE_CLASSID = "other"; - public static Qualifier PUBLICATION_DEFAULT_RESULTTYPE = new Qualifier(); - public static Qualifier DATASET_DEFAULT_RESULTTYPE = new Qualifier(); - public static Qualifier SOFTWARE_DEFAULT_RESULTTYPE = new Qualifier(); - public static Qualifier ORP_DEFAULT_RESULTTYPE = new Qualifier(); + public static Qualifier PUBLICATION_DEFAULT_RESULTTYPE = new Qualifier(); + public static Qualifier DATASET_DEFAULT_RESULTTYPE = new Qualifier(); + public static Qualifier SOFTWARE_DEFAULT_RESULTTYPE = new Qualifier(); + public static Qualifier ORP_DEFAULT_RESULTTYPE = new Qualifier(); - static { - PUBLICATION_DEFAULT_RESULTTYPE.setClassid(PUBLICATION_RESULTTYPE_CLASSID); - PUBLICATION_DEFAULT_RESULTTYPE.setClassname(PUBLICATION_RESULTTYPE_CLASSID); - PUBLICATION_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); - PUBLICATION_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); + static { + PUBLICATION_DEFAULT_RESULTTYPE.setClassid(PUBLICATION_RESULTTYPE_CLASSID); + PUBLICATION_DEFAULT_RESULTTYPE.setClassname(PUBLICATION_RESULTTYPE_CLASSID); + PUBLICATION_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); + PUBLICATION_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); - DATASET_DEFAULT_RESULTTYPE.setClassid(DATASET_RESULTTYPE_CLASSID); - 
DATASET_DEFAULT_RESULTTYPE.setClassname(DATASET_RESULTTYPE_CLASSID); - DATASET_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); - DATASET_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); + DATASET_DEFAULT_RESULTTYPE.setClassid(DATASET_RESULTTYPE_CLASSID); + DATASET_DEFAULT_RESULTTYPE.setClassname(DATASET_RESULTTYPE_CLASSID); + DATASET_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); + DATASET_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); - SOFTWARE_DEFAULT_RESULTTYPE.setClassid(SOFTWARE_RESULTTYPE_CLASSID); - SOFTWARE_DEFAULT_RESULTTYPE.setClassname(SOFTWARE_RESULTTYPE_CLASSID); - SOFTWARE_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); - SOFTWARE_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); + SOFTWARE_DEFAULT_RESULTTYPE.setClassid(SOFTWARE_RESULTTYPE_CLASSID); + SOFTWARE_DEFAULT_RESULTTYPE.setClassname(SOFTWARE_RESULTTYPE_CLASSID); + SOFTWARE_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); + SOFTWARE_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); - ORP_DEFAULT_RESULTTYPE.setClassid(ORP_RESULTTYPE_CLASSID); - ORP_DEFAULT_RESULTTYPE.setClassname(ORP_RESULTTYPE_CLASSID); - ORP_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); - ORP_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); - } + ORP_DEFAULT_RESULTTYPE.setClassid(ORP_RESULTTYPE_CLASSID); + ORP_DEFAULT_RESULTTYPE.setClassname(ORP_RESULTTYPE_CLASSID); + ORP_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); + ORP_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java index 0054e6d6fd..6f93371ec7 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -1,198 +1,205 @@ + package eu.dnetlib.dhp.schema.common; -import 
com.google.common.collect.Maps; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Map; import java.util.Optional; import java.util.function.Function; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.*; + /** Oaf model utility methods. */ public class ModelSupport { - /** Defines the mapping between the actual entity type and the main entity type */ - private static Map entityMapping = Maps.newHashMap(); + /** Defines the mapping between the actual entity type and the main entity type */ + private static Map entityMapping = Maps.newHashMap(); - static { - entityMapping.put(EntityType.publication, MainEntityType.result); - entityMapping.put(EntityType.dataset, MainEntityType.result); - entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result); - entityMapping.put(EntityType.software, MainEntityType.result); - entityMapping.put(EntityType.datasource, MainEntityType.datasource); - entityMapping.put(EntityType.organization, MainEntityType.organization); - entityMapping.put(EntityType.project, MainEntityType.project); - } + static { + entityMapping.put(EntityType.publication, MainEntityType.result); + entityMapping.put(EntityType.dataset, MainEntityType.result); + entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result); + entityMapping.put(EntityType.software, MainEntityType.result); + entityMapping.put(EntityType.datasource, MainEntityType.datasource); + entityMapping.put(EntityType.organization, MainEntityType.organization); + entityMapping.put(EntityType.project, MainEntityType.project); + } - /** - * Defines the mapping between the actual entity types and the relative classes implementing them - */ - public static final Map entityTypes = Maps.newHashMap(); + /** + * Defines the mapping between the actual entity types and the relative classes implementing them + */ + public static final Map entityTypes = Maps.newHashMap(); - static { - entityTypes.put(EntityType.datasource, Datasource.class); - 
entityTypes.put(EntityType.organization, Organization.class); - entityTypes.put(EntityType.project, Project.class); - entityTypes.put(EntityType.dataset, Dataset.class); - entityTypes.put(EntityType.otherresearchproduct, OtherResearchProduct.class); - entityTypes.put(EntityType.software, Software.class); - entityTypes.put(EntityType.publication, Publication.class); - } + static { + entityTypes.put(EntityType.datasource, Datasource.class); + entityTypes.put(EntityType.organization, Organization.class); + entityTypes.put(EntityType.project, Project.class); + entityTypes.put(EntityType.dataset, Dataset.class); + entityTypes.put(EntityType.otherresearchproduct, OtherResearchProduct.class); + entityTypes.put(EntityType.software, Software.class); + entityTypes.put(EntityType.publication, Publication.class); + } - public static final Map oafTypes = Maps.newHashMap(); + public static final Map oafTypes = Maps.newHashMap(); - static { - oafTypes.put("datasource", Datasource.class); - oafTypes.put("organization", Organization.class); - oafTypes.put("project", Project.class); - oafTypes.put("dataset", Dataset.class); - oafTypes.put("otherresearchproduct", OtherResearchProduct.class); - oafTypes.put("software", Software.class); - oafTypes.put("publication", Publication.class); - oafTypes.put("relation", Relation.class); - } + static { + oafTypes.put("datasource", Datasource.class); + oafTypes.put("organization", Organization.class); + oafTypes.put("project", Project.class); + oafTypes.put("dataset", Dataset.class); + oafTypes.put("otherresearchproduct", OtherResearchProduct.class); + oafTypes.put("software", Software.class); + oafTypes.put("publication", Publication.class); + oafTypes.put("relation", Relation.class); + } - private static final String schemeTemplate = "dnet:%s_%s_relations"; + private static final String schemeTemplate = "dnet:%s_%s_relations"; - private ModelSupport() {} + private ModelSupport() { + } - /** - * Checks subclass-superclass relationship. 
- * - * @param subClazzObject Subclass object instance - * @param superClazzObject Superclass object instance - * @param Subclass type - * @param Superclass type - * @return True if X is a subclass of Y - */ - public static Boolean isSubClass( - X subClazzObject, Y superClazzObject) { - return isSubClass(subClazzObject.getClass(), superClazzObject.getClass()); - } + /** + * Checks subclass-superclass relationship. + * + * @param subClazzObject Subclass object instance + * @param superClazzObject Superclass object instance + * @param Subclass type + * @param Superclass type + * @return True if X is a subclass of Y + */ + public static Boolean isSubClass( + X subClazzObject, Y superClazzObject) { + return isSubClass(subClazzObject.getClass(), superClazzObject.getClass()); + } - /** - * Checks subclass-superclass relationship. - * - * @param subClazzObject Subclass object instance - * @param superClazz Superclass class - * @param Subclass type - * @param Superclass type - * @return True if X is a subclass of Y - */ - public static Boolean isSubClass( - X subClazzObject, Class superClazz) { - return isSubClass(subClazzObject.getClass(), superClazz); - } + /** + * Checks subclass-superclass relationship. + * + * @param subClazzObject Subclass object instance + * @param superClazz Superclass class + * @param Subclass type + * @param Superclass type + * @return True if X is a subclass of Y + */ + public static Boolean isSubClass( + X subClazzObject, Class superClazz) { + return isSubClass(subClazzObject.getClass(), superClazz); + } - /** - * Checks subclass-superclass relationship. - * - * @param subClazz Subclass class - * @param superClazz Superclass class - * @param Subclass type - * @param Superclass type - * @return True if X is a subclass of Y - */ - public static Boolean isSubClass( - Class subClazz, Class superClazz) { - return superClazz.isAssignableFrom(subClazz); - } + /** + * Checks subclass-superclass relationship. 
+ * + * @param subClazz Subclass class + * @param superClazz Superclass class + * @param Subclass type + * @param Superclass type + * @return True if X is a subclass of Y + */ + public static Boolean isSubClass( + Class subClazz, Class superClazz) { + return superClazz.isAssignableFrom(subClazz); + } - /** - * Lists all the OAF model classes - * - * @param - * @return - */ - public static Class[] getOafModelClasses() { - return new Class[] { - Author.class, - Context.class, - Country.class, - DataInfo.class, - Dataset.class, - Datasource.class, - ExternalReference.class, - ExtraInfo.class, - Field.class, - GeoLocation.class, - Instance.class, - Journal.class, - KeyValue.class, - Oaf.class, - OafEntity.class, - OAIProvenance.class, - Organization.class, - OriginDescription.class, - OtherResearchProduct.class, - Project.class, - Publication.class, - Qualifier.class, - Relation.class, - Result.class, - Software.class, - StructuredProperty.class - }; - } + /** + * Lists all the OAF model classes + * + * @param + * @return + */ + public static Class[] getOafModelClasses() { + return new Class[] { + Author.class, + Context.class, + Country.class, + DataInfo.class, + Dataset.class, + Datasource.class, + ExternalReference.class, + ExtraInfo.class, + Field.class, + GeoLocation.class, + Instance.class, + Journal.class, + KeyValue.class, + Oaf.class, + OafEntity.class, + OAIProvenance.class, + Organization.class, + OriginDescription.class, + OtherResearchProduct.class, + Project.class, + Publication.class, + Qualifier.class, + Relation.class, + Result.class, + Software.class, + StructuredProperty.class + }; + } - public static String getMainType(final EntityType type) { - return entityMapping.get(type).name(); - } + public static String getMainType(final EntityType type) { + return entityMapping.get(type).name(); + } - public static boolean isResult(EntityType type) { - return MainEntityType.result.name().equals(getMainType(type)); - } + public static boolean 
isResult(EntityType type) { + return MainEntityType.result.name().equals(getMainType(type)); + } - public static String getScheme(final String sourceType, final String targetType) { - return String.format( - schemeTemplate, - entityMapping.get(EntityType.valueOf(sourceType)).name(), - entityMapping.get(EntityType.valueOf(targetType)).name()); - } + public static String getScheme(final String sourceType, final String targetType) { + return String + .format( + schemeTemplate, + entityMapping.get(EntityType.valueOf(sourceType)).name(), + entityMapping.get(EntityType.valueOf(targetType)).name()); + } - public static Function idFn() { - return x -> { - if (isSubClass(x, Relation.class)) { - return idFnForRelation(x); - } - return idFnForOafEntity(x); - }; - } + public static Function idFn() { + return x -> { + if (isSubClass(x, Relation.class)) { + return idFnForRelation(x); + } + return idFnForOafEntity(x); + }; + } - private static String idFnForRelation(T t) { - Relation r = (Relation) t; - return Optional.ofNullable(r.getSource()) - .map( - source -> - Optional.ofNullable(r.getTarget()) - .map( - target -> - Optional.ofNullable(r.getRelType()) - .map( - relType -> - Optional.ofNullable(r.getSubRelType()) - .map( - subRelType -> - Optional.ofNullable(r.getRelClass()) - .map( - relClass -> - String.join( - source, - target, - relType, - subRelType, - relClass)) - .orElse( - String.join( - source, - target, - relType, - subRelType))) - .orElse(String.join(source, target, relType))) - .orElse(String.join(source, target))) - .orElse(source)) - .orElse(null); - } + private static String idFnForRelation(T t) { + Relation r = (Relation) t; + return Optional + .ofNullable(r.getSource()) + .map( + source -> Optional + .ofNullable(r.getTarget()) + .map( + target -> Optional + .ofNullable(r.getRelType()) + .map( + relType -> Optional + .ofNullable(r.getSubRelType()) + .map( + subRelType -> Optional + .ofNullable(r.getRelClass()) + .map( + relClass -> String + .join( + source, + 
target, + relType, + subRelType, + relClass)) + .orElse( + String + .join( + source, + target, + relType, + subRelType))) + .orElse(String.join(source, target, relType))) + .orElse(String.join(source, target))) + .orElse(source)) + .orElse(null); + } - private static String idFnForOafEntity(T t) { - return ((OafEntity) t).getId(); - } + private static String idFnForOafEntity(T t) { + return ((OafEntity) t).getId(); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java index b4219290fc..b9bd4c5f07 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -6,81 +7,83 @@ import java.util.Objects; public class Author implements Serializable { - private String fullname; + private String fullname; - private String name; + private String name; - private String surname; + private String surname; - private Integer rank; + private Integer rank; - private List pid; + private List pid; - private List> affiliation; + private List> affiliation; - public String getFullname() { - return fullname; - } + public String getFullname() { + return fullname; + } - public void setFullname(String fullname) { - this.fullname = fullname; - } + public void setFullname(String fullname) { + this.fullname = fullname; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getSurname() { - return surname; - } + public String getSurname() { + return surname; + } - public void setSurname(String surname) { - this.surname = surname; - } + public void setSurname(String surname) { + this.surname = surname; + } - public Integer getRank() { - return rank; - } + 
public Integer getRank() { + return rank; + } - public void setRank(Integer rank) { - this.rank = rank; - } + public void setRank(Integer rank) { + this.rank = rank; + } - public List getPid() { - return pid; - } + public List getPid() { + return pid; + } - public void setPid(List pid) { - this.pid = pid; - } + public void setPid(List pid) { + this.pid = pid; + } - public List> getAffiliation() { - return affiliation; - } + public List> getAffiliation() { + return affiliation; + } - public void setAffiliation(List> affiliation) { - this.affiliation = affiliation; - } + public void setAffiliation(List> affiliation) { + this.affiliation = affiliation; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Author author = (Author) o; - return Objects.equals(fullname, author.fullname) - && Objects.equals(name, author.name) - && Objects.equals(surname, author.surname) - && Objects.equals(rank, author.rank) - && Objects.equals(pid, author.pid) - && Objects.equals(affiliation, author.affiliation); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Author author = (Author) o; + return Objects.equals(fullname, author.fullname) + && Objects.equals(name, author.name) + && Objects.equals(surname, author.surname) + && Objects.equals(rank, author.rank) + && Objects.equals(pid, author.pid) + && Objects.equals(affiliation, author.affiliation); + } - @Override - public int hashCode() { - return Objects.hash(fullname, name, surname, rank, pid, affiliation); - } + @Override + public int hashCode() { + return Objects.hash(fullname, name, surname, rank, pid, affiliation); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java index 7d930630d3..57912c4639 100644 --- 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java @@ -1,42 +1,46 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; import java.util.List; public class Context implements Serializable { - private String id; + private String id; - private List dataInfo; + private List dataInfo; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public List getDataInfo() { - return dataInfo; - } + public List getDataInfo() { + return dataInfo; + } - public void setDataInfo(List dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(List dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public int hashCode() { - return id == null ? 0 : id.hashCode(); - } + @Override + public int hashCode() { + return id == null ? 0 : id.hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - Context other = (Context) obj; + Context other = (Context) obj; - return id.equals(other.getId()); - } + return id.equals(other.getId()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java index 388b9aab65..e25fdcade6 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java @@ -1,30 +1,34 @@ + package eu.dnetlib.dhp.schema.oaf; import java.util.Objects; public class Country extends Qualifier { - private DataInfo dataInfo; + private DataInfo dataInfo; - public 
DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - if (!super.equals(o)) return false; - Country country = (Country) o; - return Objects.equals(dataInfo, country.dataInfo); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + if (!super.equals(o)) + return false; + Country country = (Country) o; + return Objects.equals(dataInfo, country.dataInfo); + } - @Override - public int hashCode() { - return Objects.hash(super.hashCode(), dataInfo); - } + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), dataInfo); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java index f65518a1f6..cc77e1ea0c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,77 +6,80 @@ import java.util.Objects; public class DataInfo implements Serializable { - private Boolean invisible = false; - private Boolean inferred; - private Boolean deletedbyinference; - private String trust; - private String inferenceprovenance; - private Qualifier provenanceaction; + private Boolean invisible = false; + private Boolean inferred; + private Boolean deletedbyinference; + private String trust; + private String inferenceprovenance; + private Qualifier provenanceaction; - public Boolean getInvisible() { - return invisible; - } + public Boolean getInvisible() { + return 
invisible; + } - public void setInvisible(Boolean invisible) { - this.invisible = invisible; - } + public void setInvisible(Boolean invisible) { + this.invisible = invisible; + } - public Boolean getInferred() { - return inferred; - } + public Boolean getInferred() { + return inferred; + } - public void setInferred(Boolean inferred) { - this.inferred = inferred; - } + public void setInferred(Boolean inferred) { + this.inferred = inferred; + } - public Boolean getDeletedbyinference() { - return deletedbyinference; - } + public Boolean getDeletedbyinference() { + return deletedbyinference; + } - public void setDeletedbyinference(Boolean deletedbyinference) { - this.deletedbyinference = deletedbyinference; - } + public void setDeletedbyinference(Boolean deletedbyinference) { + this.deletedbyinference = deletedbyinference; + } - public String getTrust() { - return trust; - } + public String getTrust() { + return trust; + } - public void setTrust(String trust) { - this.trust = trust; - } + public void setTrust(String trust) { + this.trust = trust; + } - public String getInferenceprovenance() { - return inferenceprovenance; - } + public String getInferenceprovenance() { + return inferenceprovenance; + } - public void setInferenceprovenance(String inferenceprovenance) { - this.inferenceprovenance = inferenceprovenance; - } + public void setInferenceprovenance(String inferenceprovenance) { + this.inferenceprovenance = inferenceprovenance; + } - public Qualifier getProvenanceaction() { - return provenanceaction; - } + public Qualifier getProvenanceaction() { + return provenanceaction; + } - public void setProvenanceaction(Qualifier provenanceaction) { - this.provenanceaction = provenanceaction; - } + public void setProvenanceaction(Qualifier provenanceaction) { + this.provenanceaction = provenanceaction; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - DataInfo dataInfo = 
(DataInfo) o; - return Objects.equals(invisible, dataInfo.invisible) - && Objects.equals(inferred, dataInfo.inferred) - && Objects.equals(deletedbyinference, dataInfo.deletedbyinference) - && Objects.equals(trust, dataInfo.trust) - && Objects.equals(inferenceprovenance, dataInfo.inferenceprovenance) - && Objects.equals(provenanceaction, dataInfo.provenanceaction); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + DataInfo dataInfo = (DataInfo) o; + return Objects.equals(invisible, dataInfo.invisible) + && Objects.equals(inferred, dataInfo.inferred) + && Objects.equals(deletedbyinference, dataInfo.deletedbyinference) + && Objects.equals(trust, dataInfo.trust) + && Objects.equals(inferenceprovenance, dataInfo.inferenceprovenance) + && Objects.equals(provenanceaction, dataInfo.provenanceaction); + } - @Override - public int hashCode() { - return Objects.hash( - invisible, inferred, deletedbyinference, trust, inferenceprovenance, provenanceaction); - } + @Override + public int hashCode() { + return Objects + .hash( + invisible, inferred, deletedbyinference, trust, inferenceprovenance, provenanceaction); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java index 93b51f3522..07ddbb00e9 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java @@ -1,116 +1,115 @@ + package eu.dnetlib.dhp.schema.oaf; -import eu.dnetlib.dhp.schema.common.ModelConstants; import java.io.Serializable; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; + public class Dataset extends Result implements Serializable { - private Field storagedate; + private Field storagedate; - private Field device; + private Field device; - private Field size; + private Field size; - private Field version; 
+ private Field version; - private Field lastmetadataupdate; + private Field lastmetadataupdate; - private Field metadataversionnumber; + private Field metadataversionnumber; - private List geolocation; + private List geolocation; - public Dataset() { - setResulttype(ModelConstants.DATASET_DEFAULT_RESULTTYPE); - } + public Dataset() { + setResulttype(ModelConstants.DATASET_DEFAULT_RESULTTYPE); + } - public Field getStoragedate() { - return storagedate; - } + public Field getStoragedate() { + return storagedate; + } - public void setStoragedate(Field storagedate) { - this.storagedate = storagedate; - } + public void setStoragedate(Field storagedate) { + this.storagedate = storagedate; + } - public Field getDevice() { - return device; - } + public Field getDevice() { + return device; + } - public void setDevice(Field device) { - this.device = device; - } + public void setDevice(Field device) { + this.device = device; + } - public Field getSize() { - return size; - } + public Field getSize() { + return size; + } - public void setSize(Field size) { - this.size = size; - } + public void setSize(Field size) { + this.size = size; + } - public Field getVersion() { - return version; - } + public Field getVersion() { + return version; + } - public void setVersion(Field version) { - this.version = version; - } + public void setVersion(Field version) { + this.version = version; + } - public Field getLastmetadataupdate() { - return lastmetadataupdate; - } + public Field getLastmetadataupdate() { + return lastmetadataupdate; + } - public void setLastmetadataupdate(Field lastmetadataupdate) { - this.lastmetadataupdate = lastmetadataupdate; - } + public void setLastmetadataupdate(Field lastmetadataupdate) { + this.lastmetadataupdate = lastmetadataupdate; + } - public Field getMetadataversionnumber() { - return metadataversionnumber; - } + public Field getMetadataversionnumber() { + return metadataversionnumber; + } - public void setMetadataversionnumber(Field 
metadataversionnumber) { - this.metadataversionnumber = metadataversionnumber; - } + public void setMetadataversionnumber(Field metadataversionnumber) { + this.metadataversionnumber = metadataversionnumber; + } - public List getGeolocation() { - return geolocation; - } + public List getGeolocation() { + return geolocation; + } - public void setGeolocation(List geolocation) { - this.geolocation = geolocation; - } + public void setGeolocation(List geolocation) { + this.geolocation = geolocation; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!Dataset.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Dataset.class.isAssignableFrom(e.getClass())) { + return; + } - final Dataset d = (Dataset) e; + final Dataset d = (Dataset) e; - storagedate = - d.getStoragedate() != null && compareTrust(this, e) < 0 ? d.getStoragedate() : storagedate; + storagedate = d.getStoragedate() != null && compareTrust(this, e) < 0 ? d.getStoragedate() : storagedate; - device = d.getDevice() != null && compareTrust(this, e) < 0 ? d.getDevice() : device; + device = d.getDevice() != null && compareTrust(this, e) < 0 ? d.getDevice() : device; - size = d.getSize() != null && compareTrust(this, e) < 0 ? d.getSize() : size; + size = d.getSize() != null && compareTrust(this, e) < 0 ? d.getSize() : size; - version = d.getVersion() != null && compareTrust(this, e) < 0 ? d.getVersion() : version; + version = d.getVersion() != null && compareTrust(this, e) < 0 ? d.getVersion() : version; - lastmetadataupdate = - d.getLastmetadataupdate() != null && compareTrust(this, e) < 0 - ? d.getLastmetadataupdate() - : lastmetadataupdate; + lastmetadataupdate = d.getLastmetadataupdate() != null && compareTrust(this, e) < 0 + ? d.getLastmetadataupdate() + : lastmetadataupdate; - metadataversionnumber = - d.getMetadataversionnumber() != null && compareTrust(this, e) < 0 - ? 
d.getMetadataversionnumber() - : metadataversionnumber; + metadataversionnumber = d.getMetadataversionnumber() != null && compareTrust(this, e) < 0 + ? d.getMetadataversionnumber() + : metadataversionnumber; - geolocation = mergeLists(geolocation, d.getGeolocation()); + geolocation = mergeLists(geolocation, d.getGeolocation()); - mergeOAFDataInfo(d); - } + mergeOAFDataInfo(d); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java index f0c7976312..721798206b 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,494 +6,467 @@ import java.util.List; public class Datasource extends OafEntity implements Serializable { - private Qualifier datasourcetype; + private Qualifier datasourcetype; - private Qualifier openairecompatibility; + private Qualifier openairecompatibility; - private Field officialname; + private Field officialname; - private Field englishname; + private Field englishname; - private Field websiteurl; + private Field websiteurl; - private Field logourl; + private Field logourl; - private Field contactemail; + private Field contactemail; - private Field namespaceprefix; + private Field namespaceprefix; - private Field latitude; + private Field latitude; - private Field longitude; + private Field longitude; - private Field dateofvalidation; + private Field dateofvalidation; - private Field description; + private Field description; - private List subjects; + private List subjects; - // opendoar specific fields (od*) - private Field odnumberofitems; + // opendoar specific fields (od*) + private Field odnumberofitems; - private Field odnumberofitemsdate; + private Field odnumberofitemsdate; - private Field odpolicies; + private Field odpolicies; - private List> 
odlanguages; + private List> odlanguages; - private List> odcontenttypes; + private List> odcontenttypes; - private List> accessinfopackage; + private List> accessinfopackage; - // re3data fields - private Field releasestartdate; + // re3data fields + private Field releasestartdate; - private Field releaseenddate; + private Field releaseenddate; - private Field missionstatementurl; + private Field missionstatementurl; - private Field dataprovider; + private Field dataprovider; - private Field serviceprovider; + private Field serviceprovider; - // {open, restricted or closed} - private Field databaseaccesstype; + // {open, restricted or closed} + private Field databaseaccesstype; - // {open, restricted or closed} - private Field datauploadtype; + // {open, restricted or closed} + private Field datauploadtype; - // {feeRequired, registration, other} - private Field databaseaccessrestriction; + // {feeRequired, registration, other} + private Field databaseaccessrestriction; - // {feeRequired, registration, other} - private Field datauploadrestriction; + // {feeRequired, registration, other} + private Field datauploadrestriction; - private Field versioning; + private Field versioning; - private Field citationguidelineurl; + private Field citationguidelineurl; - // {yes, no, uknown} - private Field qualitymanagementkind; + // {yes, no, uknown} + private Field qualitymanagementkind; - private Field pidsystems; + private Field pidsystems; - private Field certificates; + private Field certificates; - private List policies; + private List policies; - private Journal journal; + private Journal journal; - public Qualifier getDatasourcetype() { - return datasourcetype; - } + public Qualifier getDatasourcetype() { + return datasourcetype; + } - public void setDatasourcetype(Qualifier datasourcetype) { - this.datasourcetype = datasourcetype; - } + public void setDatasourcetype(Qualifier datasourcetype) { + this.datasourcetype = datasourcetype; + } - public Qualifier 
getOpenairecompatibility() { - return openairecompatibility; - } + public Qualifier getOpenairecompatibility() { + return openairecompatibility; + } - public void setOpenairecompatibility(Qualifier openairecompatibility) { - this.openairecompatibility = openairecompatibility; - } + public void setOpenairecompatibility(Qualifier openairecompatibility) { + this.openairecompatibility = openairecompatibility; + } - public Field getOfficialname() { - return officialname; - } + public Field getOfficialname() { + return officialname; + } - public void setOfficialname(Field officialname) { - this.officialname = officialname; - } + public void setOfficialname(Field officialname) { + this.officialname = officialname; + } - public Field getEnglishname() { - return englishname; - } + public Field getEnglishname() { + return englishname; + } - public void setEnglishname(Field englishname) { - this.englishname = englishname; - } + public void setEnglishname(Field englishname) { + this.englishname = englishname; + } - public Field getWebsiteurl() { - return websiteurl; - } + public Field getWebsiteurl() { + return websiteurl; + } - public void setWebsiteurl(Field websiteurl) { - this.websiteurl = websiteurl; - } + public void setWebsiteurl(Field websiteurl) { + this.websiteurl = websiteurl; + } - public Field getLogourl() { - return logourl; - } + public Field getLogourl() { + return logourl; + } - public void setLogourl(Field logourl) { - this.logourl = logourl; - } + public void setLogourl(Field logourl) { + this.logourl = logourl; + } - public Field getContactemail() { - return contactemail; - } + public Field getContactemail() { + return contactemail; + } - public void setContactemail(Field contactemail) { - this.contactemail = contactemail; - } + public void setContactemail(Field contactemail) { + this.contactemail = contactemail; + } - public Field getNamespaceprefix() { - return namespaceprefix; - } + public Field getNamespaceprefix() { + return namespaceprefix; + } - 
public void setNamespaceprefix(Field namespaceprefix) { - this.namespaceprefix = namespaceprefix; - } + public void setNamespaceprefix(Field namespaceprefix) { + this.namespaceprefix = namespaceprefix; + } - public Field getLatitude() { - return latitude; - } + public Field getLatitude() { + return latitude; + } - public void setLatitude(Field latitude) { - this.latitude = latitude; - } + public void setLatitude(Field latitude) { + this.latitude = latitude; + } - public Field getLongitude() { - return longitude; - } + public Field getLongitude() { + return longitude; + } - public void setLongitude(Field longitude) { - this.longitude = longitude; - } + public void setLongitude(Field longitude) { + this.longitude = longitude; + } - public Field getDateofvalidation() { - return dateofvalidation; - } + public Field getDateofvalidation() { + return dateofvalidation; + } - public void setDateofvalidation(Field dateofvalidation) { - this.dateofvalidation = dateofvalidation; - } + public void setDateofvalidation(Field dateofvalidation) { + this.dateofvalidation = dateofvalidation; + } - public Field getDescription() { - return description; - } + public Field getDescription() { + return description; + } - public void setDescription(Field description) { - this.description = description; - } + public void setDescription(Field description) { + this.description = description; + } - public List getSubjects() { - return subjects; - } + public List getSubjects() { + return subjects; + } - public void setSubjects(List subjects) { - this.subjects = subjects; - } + public void setSubjects(List subjects) { + this.subjects = subjects; + } - public Field getOdnumberofitems() { - return odnumberofitems; - } + public Field getOdnumberofitems() { + return odnumberofitems; + } - public void setOdnumberofitems(Field odnumberofitems) { - this.odnumberofitems = odnumberofitems; - } + public void setOdnumberofitems(Field odnumberofitems) { + this.odnumberofitems = odnumberofitems; + } - public 
Field getOdnumberofitemsdate() { - return odnumberofitemsdate; - } + public Field getOdnumberofitemsdate() { + return odnumberofitemsdate; + } - public void setOdnumberofitemsdate(Field odnumberofitemsdate) { - this.odnumberofitemsdate = odnumberofitemsdate; - } + public void setOdnumberofitemsdate(Field odnumberofitemsdate) { + this.odnumberofitemsdate = odnumberofitemsdate; + } - public Field getOdpolicies() { - return odpolicies; - } + public Field getOdpolicies() { + return odpolicies; + } - public void setOdpolicies(Field odpolicies) { - this.odpolicies = odpolicies; - } + public void setOdpolicies(Field odpolicies) { + this.odpolicies = odpolicies; + } - public List> getOdlanguages() { - return odlanguages; - } + public List> getOdlanguages() { + return odlanguages; + } - public void setOdlanguages(List> odlanguages) { - this.odlanguages = odlanguages; - } + public void setOdlanguages(List> odlanguages) { + this.odlanguages = odlanguages; + } - public List> getOdcontenttypes() { - return odcontenttypes; - } + public List> getOdcontenttypes() { + return odcontenttypes; + } - public void setOdcontenttypes(List> odcontenttypes) { - this.odcontenttypes = odcontenttypes; - } + public void setOdcontenttypes(List> odcontenttypes) { + this.odcontenttypes = odcontenttypes; + } - public List> getAccessinfopackage() { - return accessinfopackage; - } + public List> getAccessinfopackage() { + return accessinfopackage; + } - public void setAccessinfopackage(List> accessinfopackage) { - this.accessinfopackage = accessinfopackage; - } + public void setAccessinfopackage(List> accessinfopackage) { + this.accessinfopackage = accessinfopackage; + } - public Field getReleasestartdate() { - return releasestartdate; - } + public Field getReleasestartdate() { + return releasestartdate; + } - public void setReleasestartdate(Field releasestartdate) { - this.releasestartdate = releasestartdate; - } + public void setReleasestartdate(Field releasestartdate) { + this.releasestartdate = 
releasestartdate; + } - public Field getReleaseenddate() { - return releaseenddate; - } + public Field getReleaseenddate() { + return releaseenddate; + } - public void setReleaseenddate(Field releaseenddate) { - this.releaseenddate = releaseenddate; - } + public void setReleaseenddate(Field releaseenddate) { + this.releaseenddate = releaseenddate; + } - public Field getMissionstatementurl() { - return missionstatementurl; - } + public Field getMissionstatementurl() { + return missionstatementurl; + } - public void setMissionstatementurl(Field missionstatementurl) { - this.missionstatementurl = missionstatementurl; - } + public void setMissionstatementurl(Field missionstatementurl) { + this.missionstatementurl = missionstatementurl; + } - public Field getDataprovider() { - return dataprovider; - } + public Field getDataprovider() { + return dataprovider; + } - public void setDataprovider(Field dataprovider) { - this.dataprovider = dataprovider; - } + public void setDataprovider(Field dataprovider) { + this.dataprovider = dataprovider; + } - public Field getServiceprovider() { - return serviceprovider; - } + public Field getServiceprovider() { + return serviceprovider; + } - public void setServiceprovider(Field serviceprovider) { - this.serviceprovider = serviceprovider; - } + public void setServiceprovider(Field serviceprovider) { + this.serviceprovider = serviceprovider; + } - public Field getDatabaseaccesstype() { - return databaseaccesstype; - } + public Field getDatabaseaccesstype() { + return databaseaccesstype; + } - public void setDatabaseaccesstype(Field databaseaccesstype) { - this.databaseaccesstype = databaseaccesstype; - } - - public Field getDatauploadtype() { - return datauploadtype; - } - - public void setDatauploadtype(Field datauploadtype) { - this.datauploadtype = datauploadtype; - } - - public Field getDatabaseaccessrestriction() { - return databaseaccessrestriction; - } - - public void setDatabaseaccessrestriction(Field databaseaccessrestriction) 
{ - this.databaseaccessrestriction = databaseaccessrestriction; - } - - public Field getDatauploadrestriction() { - return datauploadrestriction; - } - - public void setDatauploadrestriction(Field datauploadrestriction) { - this.datauploadrestriction = datauploadrestriction; - } - - public Field getVersioning() { - return versioning; - } - - public void setVersioning(Field versioning) { - this.versioning = versioning; - } - - public Field getCitationguidelineurl() { - return citationguidelineurl; - } - - public void setCitationguidelineurl(Field citationguidelineurl) { - this.citationguidelineurl = citationguidelineurl; - } - - public Field getQualitymanagementkind() { - return qualitymanagementkind; - } - - public void setQualitymanagementkind(Field qualitymanagementkind) { - this.qualitymanagementkind = qualitymanagementkind; - } - - public Field getPidsystems() { - return pidsystems; - } - - public void setPidsystems(Field pidsystems) { - this.pidsystems = pidsystems; - } - - public Field getCertificates() { - return certificates; - } - - public void setCertificates(Field certificates) { - this.certificates = certificates; - } - - public List getPolicies() { - return policies; - } - - public void setPolicies(List policies) { - this.policies = policies; - } - - public Journal getJournal() { - return journal; - } - - public void setJournal(Journal journal) { - this.journal = journal; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Datasource.class.isAssignableFrom(e.getClass())) { - return; - } - - Datasource d = (Datasource) e; - - datasourcetype = - d.getDatasourcetype() != null && compareTrust(this, e) < 0 - ? d.getDatasourcetype() - : datasourcetype; - openairecompatibility = - d.getOpenairecompatibility() != null && compareTrust(this, e) < 0 - ? d.getOpenairecompatibility() - : openairecompatibility; - officialname = - d.getOfficialname() != null && compareTrust(this, e) < 0 - ? 
d.getOfficialname() - : officialname; - englishname = - d.getEnglishname() != null && compareTrust(this, e) < 0 ? d.getEnglishname() : officialname; - websiteurl = - d.getWebsiteurl() != null && compareTrust(this, e) < 0 ? d.getWebsiteurl() : websiteurl; - logourl = d.getLogourl() != null && compareTrust(this, e) < 0 ? d.getLogourl() : getLogourl(); - contactemail = - d.getContactemail() != null && compareTrust(this, e) < 0 - ? d.getContactemail() - : contactemail; - namespaceprefix = - d.getNamespaceprefix() != null && compareTrust(this, e) < 0 - ? d.getNamespaceprefix() - : namespaceprefix; - latitude = d.getLatitude() != null && compareTrust(this, e) < 0 ? d.getLatitude() : latitude; - longitude = - d.getLongitude() != null && compareTrust(this, e) < 0 ? d.getLongitude() : longitude; - dateofvalidation = - d.getDateofvalidation() != null && compareTrust(this, e) < 0 - ? d.getDateofvalidation() - : dateofvalidation; - description = - d.getDescription() != null && compareTrust(this, e) < 0 ? d.getDescription() : description; - subjects = mergeLists(subjects, d.getSubjects()); - - // opendoar specific fields (od*) - odnumberofitems = - d.getOdnumberofitems() != null && compareTrust(this, e) < 0 - ? d.getOdnumberofitems() - : odnumberofitems; - odnumberofitemsdate = - d.getOdnumberofitemsdate() != null && compareTrust(this, e) < 0 - ? d.getOdnumberofitemsdate() - : odnumberofitemsdate; - odpolicies = - d.getOdpolicies() != null && compareTrust(this, e) < 0 ? d.getOdpolicies() : odpolicies; - odlanguages = mergeLists(odlanguages, d.getOdlanguages()); - odcontenttypes = mergeLists(odcontenttypes, d.getOdcontenttypes()); - accessinfopackage = mergeLists(accessinfopackage, d.getAccessinfopackage()); - - // re3data fields - releasestartdate = - d.getReleasestartdate() != null && compareTrust(this, e) < 0 - ? d.getReleasestartdate() - : releasestartdate; - releaseenddate = - d.getReleaseenddate() != null && compareTrust(this, e) < 0 - ? 
d.getReleaseenddate() - : releaseenddate; - missionstatementurl = - d.getMissionstatementurl() != null && compareTrust(this, e) < 0 - ? d.getMissionstatementurl() - : missionstatementurl; - dataprovider = - d.getDataprovider() != null && compareTrust(this, e) < 0 - ? d.getDataprovider() - : dataprovider; - serviceprovider = - d.getServiceprovider() != null && compareTrust(this, e) < 0 - ? d.getServiceprovider() - : serviceprovider; - - // {open, restricted or closed} - databaseaccesstype = - d.getDatabaseaccesstype() != null && compareTrust(this, e) < 0 - ? d.getDatabaseaccesstype() - : databaseaccesstype; - - // {open, restricted or closed} - datauploadtype = - d.getDatauploadtype() != null && compareTrust(this, e) < 0 - ? d.getDatauploadtype() - : datauploadtype; - - // {feeRequired, registration, other} - databaseaccessrestriction = - d.getDatabaseaccessrestriction() != null && compareTrust(this, e) < 0 - ? d.getDatabaseaccessrestriction() - : databaseaccessrestriction; - - // {feeRequired, registration, other} - datauploadrestriction = - d.getDatauploadrestriction() != null && compareTrust(this, e) < 0 - ? d.getDatauploadrestriction() - : datauploadrestriction; - - versioning = - d.getVersioning() != null && compareTrust(this, e) < 0 ? d.getVersioning() : versioning; - citationguidelineurl = - d.getCitationguidelineurl() != null && compareTrust(this, e) < 0 - ? d.getCitationguidelineurl() - : citationguidelineurl; - - // {yes, no, unknown} - qualitymanagementkind = - d.getQualitymanagementkind() != null && compareTrust(this, e) < 0 - ? d.getQualitymanagementkind() - : qualitymanagementkind; - pidsystems = - d.getPidsystems() != null && compareTrust(this, e) < 0 ? d.getPidsystems() : pidsystems; - - certificates = - d.getCertificates() != null && compareTrust(this, e) < 0 - ? d.getCertificates() - : certificates; - - policies = mergeLists(policies, d.getPolicies()); - - journal = d.getJournal() != null && compareTrust(this, e) < 0 ? 
d.getJournal() : journal; - - mergeOAFDataInfo(e); - } + public void setDatabaseaccesstype(Field databaseaccesstype) { + this.databaseaccesstype = databaseaccesstype; + } + + public Field getDatauploadtype() { + return datauploadtype; + } + + public void setDatauploadtype(Field datauploadtype) { + this.datauploadtype = datauploadtype; + } + + public Field getDatabaseaccessrestriction() { + return databaseaccessrestriction; + } + + public void setDatabaseaccessrestriction(Field databaseaccessrestriction) { + this.databaseaccessrestriction = databaseaccessrestriction; + } + + public Field getDatauploadrestriction() { + return datauploadrestriction; + } + + public void setDatauploadrestriction(Field datauploadrestriction) { + this.datauploadrestriction = datauploadrestriction; + } + + public Field getVersioning() { + return versioning; + } + + public void setVersioning(Field versioning) { + this.versioning = versioning; + } + + public Field getCitationguidelineurl() { + return citationguidelineurl; + } + + public void setCitationguidelineurl(Field citationguidelineurl) { + this.citationguidelineurl = citationguidelineurl; + } + + public Field getQualitymanagementkind() { + return qualitymanagementkind; + } + + public void setQualitymanagementkind(Field qualitymanagementkind) { + this.qualitymanagementkind = qualitymanagementkind; + } + + public Field getPidsystems() { + return pidsystems; + } + + public void setPidsystems(Field pidsystems) { + this.pidsystems = pidsystems; + } + + public Field getCertificates() { + return certificates; + } + + public void setCertificates(Field certificates) { + this.certificates = certificates; + } + + public List getPolicies() { + return policies; + } + + public void setPolicies(List policies) { + this.policies = policies; + } + + public Journal getJournal() { + return journal; + } + + public void setJournal(Journal journal) { + this.journal = journal; + } + + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + + 
if (!Datasource.class.isAssignableFrom(e.getClass())) { + return; + } + + Datasource d = (Datasource) e; + + datasourcetype = d.getDatasourcetype() != null && compareTrust(this, e) < 0 + ? d.getDatasourcetype() + : datasourcetype; + openairecompatibility = d.getOpenairecompatibility() != null && compareTrust(this, e) < 0 + ? d.getOpenairecompatibility() + : openairecompatibility; + officialname = d.getOfficialname() != null && compareTrust(this, e) < 0 + ? d.getOfficialname() + : officialname; + englishname = d.getEnglishname() != null && compareTrust(this, e) < 0 ? d.getEnglishname() : officialname; + websiteurl = d.getWebsiteurl() != null && compareTrust(this, e) < 0 ? d.getWebsiteurl() : websiteurl; + logourl = d.getLogourl() != null && compareTrust(this, e) < 0 ? d.getLogourl() : getLogourl(); + contactemail = d.getContactemail() != null && compareTrust(this, e) < 0 + ? d.getContactemail() + : contactemail; + namespaceprefix = d.getNamespaceprefix() != null && compareTrust(this, e) < 0 + ? d.getNamespaceprefix() + : namespaceprefix; + latitude = d.getLatitude() != null && compareTrust(this, e) < 0 ? d.getLatitude() : latitude; + longitude = d.getLongitude() != null && compareTrust(this, e) < 0 ? d.getLongitude() : longitude; + dateofvalidation = d.getDateofvalidation() != null && compareTrust(this, e) < 0 + ? d.getDateofvalidation() + : dateofvalidation; + description = d.getDescription() != null && compareTrust(this, e) < 0 ? d.getDescription() : description; + subjects = mergeLists(subjects, d.getSubjects()); + + // opendoar specific fields (od*) + odnumberofitems = d.getOdnumberofitems() != null && compareTrust(this, e) < 0 + ? d.getOdnumberofitems() + : odnumberofitems; + odnumberofitemsdate = d.getOdnumberofitemsdate() != null && compareTrust(this, e) < 0 + ? d.getOdnumberofitemsdate() + : odnumberofitemsdate; + odpolicies = d.getOdpolicies() != null && compareTrust(this, e) < 0 ? 
d.getOdpolicies() : odpolicies; + odlanguages = mergeLists(odlanguages, d.getOdlanguages()); + odcontenttypes = mergeLists(odcontenttypes, d.getOdcontenttypes()); + accessinfopackage = mergeLists(accessinfopackage, d.getAccessinfopackage()); + + // re3data fields + releasestartdate = d.getReleasestartdate() != null && compareTrust(this, e) < 0 + ? d.getReleasestartdate() + : releasestartdate; + releaseenddate = d.getReleaseenddate() != null && compareTrust(this, e) < 0 + ? d.getReleaseenddate() + : releaseenddate; + missionstatementurl = d.getMissionstatementurl() != null && compareTrust(this, e) < 0 + ? d.getMissionstatementurl() + : missionstatementurl; + dataprovider = d.getDataprovider() != null && compareTrust(this, e) < 0 + ? d.getDataprovider() + : dataprovider; + serviceprovider = d.getServiceprovider() != null && compareTrust(this, e) < 0 + ? d.getServiceprovider() + : serviceprovider; + + // {open, restricted or closed} + databaseaccesstype = d.getDatabaseaccesstype() != null && compareTrust(this, e) < 0 + ? d.getDatabaseaccesstype() + : databaseaccesstype; + + // {open, restricted or closed} + datauploadtype = d.getDatauploadtype() != null && compareTrust(this, e) < 0 + ? d.getDatauploadtype() + : datauploadtype; + + // {feeRequired, registration, other} + databaseaccessrestriction = d.getDatabaseaccessrestriction() != null && compareTrust(this, e) < 0 + ? d.getDatabaseaccessrestriction() + : databaseaccessrestriction; + + // {feeRequired, registration, other} + datauploadrestriction = d.getDatauploadrestriction() != null && compareTrust(this, e) < 0 + ? d.getDatauploadrestriction() + : datauploadrestriction; + + versioning = d.getVersioning() != null && compareTrust(this, e) < 0 ? d.getVersioning() : versioning; + citationguidelineurl = d.getCitationguidelineurl() != null && compareTrust(this, e) < 0 + ? 
d.getCitationguidelineurl() + : citationguidelineurl; + + // {yes, no, unknown} + qualitymanagementkind = d.getQualitymanagementkind() != null && compareTrust(this, e) < 0 + ? d.getQualitymanagementkind() + : qualitymanagementkind; + pidsystems = d.getPidsystems() != null && compareTrust(this, e) < 0 ? d.getPidsystems() : pidsystems; + + certificates = d.getCertificates() != null && compareTrust(this, e) < 0 + ? d.getCertificates() + : certificates; + + policies = mergeLists(policies, d.getPolicies()); + + journal = d.getJournal() != null && compareTrust(this, e) < 0 ? d.getJournal() : journal; + + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java index 67b48ed16a..d509b954e7 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java @@ -1,115 +1,119 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; import java.util.Objects; public class ExternalReference implements Serializable { - // source - private String sitename; + // source + private String sitename; - // title - private String label; + // title + private String label; - // text() - private String url; + // text() + private String url; - // ?? not mapped yet ?? - private String description; + // ?? not mapped yet ?? 
+ private String description; - // type - private Qualifier qualifier; + // type + private Qualifier qualifier; - // site internal identifier - private String refidentifier; + // site internal identifier + private String refidentifier; - // maps the oaf:reference/@query attribute - private String query; + // maps the oaf:reference/@query attribute + private String query; - // ExternalReferences might be also inferred - private DataInfo dataInfo; + // ExternalReferences might be also inferred + private DataInfo dataInfo; - public String getSitename() { - return sitename; - } + public String getSitename() { + return sitename; + } - public void setSitename(String sitename) { - this.sitename = sitename; - } + public void setSitename(String sitename) { + this.sitename = sitename; + } - public String getLabel() { - return label; - } + public String getLabel() { + return label; + } - public void setLabel(String label) { - this.label = label; - } + public void setLabel(String label) { + this.label = label; + } - public String getUrl() { - return url; - } + public String getUrl() { + return url; + } - public void setUrl(String url) { - this.url = url; - } + public void setUrl(String url) { + this.url = url; + } - public String getDescription() { - return description; - } + public String getDescription() { + return description; + } - public void setDescription(String description) { - this.description = description; - } + public void setDescription(String description) { + this.description = description; + } - public Qualifier getQualifier() { - return qualifier; - } + public Qualifier getQualifier() { + return qualifier; + } - public void setQualifier(Qualifier qualifier) { - this.qualifier = qualifier; - } + public void setQualifier(Qualifier qualifier) { + this.qualifier = qualifier; + } - public String getRefidentifier() { - return refidentifier; - } + public String getRefidentifier() { + return refidentifier; + } - public void setRefidentifier(String refidentifier) { - 
this.refidentifier = refidentifier; - } + public void setRefidentifier(String refidentifier) { + this.refidentifier = refidentifier; + } - public String getQuery() { - return query; - } + public String getQuery() { + return query; + } - public void setQuery(String query) { - this.query = query; - } + public void setQuery(String query) { + this.query = query; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - ExternalReference that = (ExternalReference) o; - return Objects.equals(sitename, that.sitename) - && Objects.equals(label, that.label) - && Objects.equals(url, that.url) - && Objects.equals(description, that.description) - && Objects.equals(qualifier, that.qualifier) - && Objects.equals(refidentifier, that.refidentifier) - && Objects.equals(query, that.query) - && Objects.equals(dataInfo, that.dataInfo); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + ExternalReference that = (ExternalReference) o; + return Objects.equals(sitename, that.sitename) + && Objects.equals(label, that.label) + && Objects.equals(url, that.url) + && Objects.equals(description, that.description) + && Objects.equals(qualifier, that.qualifier) + && Objects.equals(refidentifier, that.refidentifier) + && Objects.equals(query, that.query) + && Objects.equals(dataInfo, that.dataInfo); + } - @Override - public int hashCode() { - return Objects.hash( - sitename, label, url, description, qualifier, refidentifier, query, dataInfo); - } + @Override + public int hashCode() { + return Objects + .hash( + sitename, label, url, description, 
qualifier, refidentifier, query, dataInfo); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java index c19c08f5f2..3682cc2aa2 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java @@ -1,74 +1,77 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; import java.util.Objects; public class ExtraInfo implements Serializable { - private String name; + private String name; - private String typology; + private String typology; - private String provenance; + private String provenance; - private String trust; + private String trust; - // json containing a Citation or Statistics - private String value; + // json containing a Citation or Statistics + private String value; - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getTypology() { - return typology; - } + public String getTypology() { + return typology; + } - public void setTypology(String typology) { - this.typology = typology; - } + public void setTypology(String typology) { + this.typology = typology; + } - public String getProvenance() { - return provenance; - } + public String getProvenance() { + return provenance; + } - public void setProvenance(String provenance) { - this.provenance = provenance; - } + public void setProvenance(String provenance) { + this.provenance = provenance; + } - public String getTrust() { - return trust; - } + public String getTrust() { + return trust; + } - public void setTrust(String trust) { - this.trust = trust; - } + public void setTrust(String trust) { + this.trust = trust; + } - public String getValue() { - return value; - } + public String getValue() { + return value; + } - public void 
setValue(String value) { - this.value = value; - } + public void setValue(String value) { + this.value = value; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - ExtraInfo extraInfo = (ExtraInfo) o; - return Objects.equals(name, extraInfo.name) - && Objects.equals(typology, extraInfo.typology) - && Objects.equals(provenance, extraInfo.provenance) - && Objects.equals(trust, extraInfo.trust) - && Objects.equals(value, extraInfo.value); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + ExtraInfo extraInfo = (ExtraInfo) o; + return Objects.equals(name, extraInfo.name) + && Objects.equals(typology, extraInfo.typology) + && Objects.equals(provenance, extraInfo.provenance) + && Objects.equals(trust, extraInfo.trust) + && Objects.equals(value, extraInfo.value); + } - @Override - public int hashCode() { - return Objects.hash(name, typology, provenance, trust, value); - } + @Override + public int hashCode() { + return Objects.hash(name, typology, provenance, trust, value); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java index 1854b85c10..1a85c6842d 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java @@ -1,40 +1,44 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; public class Field implements Serializable { - private T value; + private T value; - private DataInfo dataInfo; + private DataInfo dataInfo; - public T getValue() { - return value; - } + public T getValue() { + return value; + } - public void setValue(T value) { - this.value = value; - } + public void setValue(T value) { + this.value = value; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public 
DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public int hashCode() { - return getValue() == null ? 0 : getValue().hashCode(); - } + @Override + public int hashCode() { + return getValue() == null ? 0 : getValue().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; - Field other = (Field) obj; - return getValue().equals(other.getValue()); - } + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + Field other = (Field) obj; + return getValue().equals(other.getValue()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java index 741f19002f..7ed313a59c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java @@ -1,69 +1,76 @@ + package eu.dnetlib.dhp.schema.oaf; -import com.fasterxml.jackson.annotation.JsonIgnore; import java.io.Serializable; + import org.apache.commons.lang3.StringUtils; +import com.fasterxml.jackson.annotation.JsonIgnore; + public class GeoLocation implements Serializable { - private String point; + private String point; - private String box; + private String box; - private String place; + private String place; - public String getPoint() { - return point; - } + public String getPoint() { + return point; + } - public void setPoint(String point) { - this.point = point; - } + public void setPoint(String point) { + this.point = point; + } - public String getBox() { - return box; - } + public String getBox() { + return box; + } - public 
void setBox(String box) { - this.box = box; - } + public void setBox(String box) { + this.box = box; + } - public String getPlace() { - return place; - } + public String getPlace() { + return place; + } - public void setPlace(String place) { - this.place = place; - } + public void setPlace(String place) { + this.place = place; + } - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(point) && StringUtils.isBlank(box) && StringUtils.isBlank(place); - } + @JsonIgnore + public boolean isBlank() { + return StringUtils.isBlank(point) && StringUtils.isBlank(box) && StringUtils.isBlank(place); + } - public String toComparableString() { - return isBlank() - ? "" - : String.format( - "%s::%s%s", - point != null ? point.toLowerCase() : "", - box != null ? box.toLowerCase() : "", - place != null ? place.toLowerCase() : ""); - } + public String toComparableString() { + return isBlank() + ? "" + : String + .format( + "%s::%s%s", + point != null ? point.toLowerCase() : "", + box != null ? box.toLowerCase() : "", + place != null ? 
place.toLowerCase() : ""); + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - GeoLocation other = (GeoLocation) obj; + GeoLocation other = (GeoLocation) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index 76b72cfbc9..2b7d3846c0 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,143 +6,147 @@ import java.util.List; public class Instance implements Serializable { - private Field license; + private Field license; - private Qualifier accessright; + private Qualifier accessright; - private Qualifier instancetype; + private Qualifier instancetype; - private KeyValue hostedby; + private KeyValue hostedby; - private List url; + private List url; - // other research products specifc - private String distributionlocation; + // other research products specifc + private String distributionlocation; - private KeyValue collectedfrom; + private KeyValue collectedfrom; - private Field dateofacceptance; + private Field dateofacceptance; - // ( article | book ) processing charges. 
Defined here to cope with possible wrongly typed - // results - private Field processingchargeamount; + // ( article | book ) processing charges. Defined here to cope with possible wrongly typed + // results + private Field processingchargeamount; - // currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly - // typed results - private Field processingchargecurrency; + // currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly + // typed results + private Field processingchargecurrency; - private Field refereed; // peer-review status + private Field refereed; // peer-review status - public Field getLicense() { - return license; - } + public Field getLicense() { + return license; + } - public void setLicense(Field license) { - this.license = license; - } + public void setLicense(Field license) { + this.license = license; + } - public Qualifier getAccessright() { - return accessright; - } + public Qualifier getAccessright() { + return accessright; + } - public void setAccessright(Qualifier accessright) { - this.accessright = accessright; - } + public void setAccessright(Qualifier accessright) { + this.accessright = accessright; + } - public Qualifier getInstancetype() { - return instancetype; - } + public Qualifier getInstancetype() { + return instancetype; + } - public void setInstancetype(Qualifier instancetype) { - this.instancetype = instancetype; - } + public void setInstancetype(Qualifier instancetype) { + this.instancetype = instancetype; + } - public KeyValue getHostedby() { - return hostedby; - } + public KeyValue getHostedby() { + return hostedby; + } - public void setHostedby(KeyValue hostedby) { - this.hostedby = hostedby; - } + public void setHostedby(KeyValue hostedby) { + this.hostedby = hostedby; + } - public List getUrl() { - return url; - } + public List getUrl() { + return url; + } - public void setUrl(List url) { - this.url = url; - } + public void setUrl(List url) { + 
this.url = url; + } - public String getDistributionlocation() { - return distributionlocation; - } + public String getDistributionlocation() { + return distributionlocation; + } - public void setDistributionlocation(String distributionlocation) { - this.distributionlocation = distributionlocation; - } + public void setDistributionlocation(String distributionlocation) { + this.distributionlocation = distributionlocation; + } - public KeyValue getCollectedfrom() { - return collectedfrom; - } + public KeyValue getCollectedfrom() { + return collectedfrom; + } - public void setCollectedfrom(KeyValue collectedfrom) { - this.collectedfrom = collectedfrom; - } + public void setCollectedfrom(KeyValue collectedfrom) { + this.collectedfrom = collectedfrom; + } - public Field getDateofacceptance() { - return dateofacceptance; - } + public Field getDateofacceptance() { + return dateofacceptance; + } - public void setDateofacceptance(Field dateofacceptance) { - this.dateofacceptance = dateofacceptance; - } + public void setDateofacceptance(Field dateofacceptance) { + this.dateofacceptance = dateofacceptance; + } - public Field getProcessingchargeamount() { - return processingchargeamount; - } + public Field getProcessingchargeamount() { + return processingchargeamount; + } - public void setProcessingchargeamount(Field processingchargeamount) { - this.processingchargeamount = processingchargeamount; - } + public void setProcessingchargeamount(Field processingchargeamount) { + this.processingchargeamount = processingchargeamount; + } - public Field getProcessingchargecurrency() { - return processingchargecurrency; - } + public Field getProcessingchargecurrency() { + return processingchargecurrency; + } - public void setProcessingchargecurrency(Field processingchargecurrency) { - this.processingchargecurrency = processingchargecurrency; - } + public void setProcessingchargecurrency(Field processingchargecurrency) { + this.processingchargecurrency = processingchargecurrency; + } - 
public Field getRefereed() { - return refereed; - } + public Field getRefereed() { + return refereed; + } - public void setRefereed(Field refereed) { - this.refereed = refereed; - } + public void setRefereed(Field refereed) { + this.refereed = refereed; + } - public String toComparableString() { - return String.format( - "%s::%s::%s::%s", - hostedby != null && hostedby.getKey() != null ? hostedby.getKey().toLowerCase() : "", - accessright != null && accessright.getClassid() != null ? accessright.getClassid() : "", - instancetype != null && instancetype.getClassid() != null ? instancetype.getClassid() : "", - url != null ? url : ""); - } + public String toComparableString() { + return String + .format( + "%s::%s::%s::%s", + hostedby != null && hostedby.getKey() != null ? hostedby.getKey().toLowerCase() : "", + accessright != null && accessright.getClassid() != null ? accessright.getClassid() : "", + instancetype != null && instancetype.getClassid() != null ? instancetype.getClassid() : "", + url != null ? 
url : ""); + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - Instance other = (Instance) obj; + Instance other = (Instance) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java index bdf64f8124..7a375e28bc 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,159 +6,162 @@ import java.util.Objects; public class Journal implements Serializable { - private String name; + private String name; - private String issnPrinted; + private String issnPrinted; - private String issnOnline; + private String issnOnline; - private String issnLinking; + private String issnLinking; - private String ep; + private String ep; - private String iss; + private String iss; - private String sp; + private String sp; - private String vol; + private String vol; - private String edition; + private String edition; - private String conferenceplace; + private String conferenceplace; - private String conferencedate; + private String conferencedate; - private DataInfo dataInfo; + private DataInfo dataInfo; - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { 
- this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getIssnPrinted() { - return issnPrinted; - } + public String getIssnPrinted() { + return issnPrinted; + } - public void setIssnPrinted(String issnPrinted) { - this.issnPrinted = issnPrinted; - } + public void setIssnPrinted(String issnPrinted) { + this.issnPrinted = issnPrinted; + } - public String getIssnOnline() { - return issnOnline; - } + public String getIssnOnline() { + return issnOnline; + } - public void setIssnOnline(String issnOnline) { - this.issnOnline = issnOnline; - } + public void setIssnOnline(String issnOnline) { + this.issnOnline = issnOnline; + } - public String getIssnLinking() { - return issnLinking; - } + public String getIssnLinking() { + return issnLinking; + } - public void setIssnLinking(String issnLinking) { - this.issnLinking = issnLinking; - } + public void setIssnLinking(String issnLinking) { + this.issnLinking = issnLinking; + } - public String getEp() { - return ep; - } + public String getEp() { + return ep; + } - public void setEp(String ep) { - this.ep = ep; - } + public void setEp(String ep) { + this.ep = ep; + } - public String getIss() { - return iss; - } + public String getIss() { + return iss; + } - public void setIss(String iss) { - this.iss = iss; - } + public void setIss(String iss) { + this.iss = iss; + } - public String getSp() { - return sp; - } + public String getSp() { + return sp; + } - public void setSp(String sp) { - this.sp = sp; - } + public void setSp(String sp) { + this.sp = sp; + } - public String getVol() { - return vol; - } + public String getVol() { + return vol; + } - public void setVol(String vol) { - this.vol = vol; - } + public void setVol(String vol) { + this.vol = vol; + } - public String getEdition() { - return edition; - } + public String getEdition() { + return edition; + } - public void setEdition(String edition) { - this.edition = edition; - } + public void setEdition(String edition) { + 
this.edition = edition; + } - public String getConferenceplace() { - return conferenceplace; - } + public String getConferenceplace() { + return conferenceplace; + } - public void setConferenceplace(String conferenceplace) { - this.conferenceplace = conferenceplace; - } + public void setConferenceplace(String conferenceplace) { + this.conferenceplace = conferenceplace; + } - public String getConferencedate() { - return conferencedate; - } + public String getConferencedate() { + return conferencedate; + } - public void setConferencedate(String conferencedate) { - this.conferencedate = conferencedate; - } + public void setConferencedate(String conferencedate) { + this.conferencedate = conferencedate; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Journal journal = (Journal) o; - return Objects.equals(name, journal.name) - && Objects.equals(issnPrinted, journal.issnPrinted) - && Objects.equals(issnOnline, journal.issnOnline) - && Objects.equals(issnLinking, journal.issnLinking) - && Objects.equals(ep, journal.ep) - && Objects.equals(iss, journal.iss) - && Objects.equals(sp, journal.sp) - && Objects.equals(vol, journal.vol) - && Objects.equals(edition, journal.edition) - && Objects.equals(conferenceplace, journal.conferenceplace) - && Objects.equals(conferencedate, journal.conferencedate) - && Objects.equals(dataInfo, journal.dataInfo); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Journal journal = (Journal) o; + return Objects.equals(name, journal.name) + && Objects.equals(issnPrinted, 
journal.issnPrinted) + && Objects.equals(issnOnline, journal.issnOnline) + && Objects.equals(issnLinking, journal.issnLinking) + && Objects.equals(ep, journal.ep) + && Objects.equals(iss, journal.iss) + && Objects.equals(sp, journal.sp) + && Objects.equals(vol, journal.vol) + && Objects.equals(edition, journal.edition) + && Objects.equals(conferenceplace, journal.conferenceplace) + && Objects.equals(conferencedate, journal.conferencedate) + && Objects.equals(dataInfo, journal.dataInfo); + } - @Override - public int hashCode() { - return Objects.hash( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - conferenceplace, - conferencedate, - dataInfo); - } + @Override + public int hashCode() { + return Objects + .hash( + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + conferenceplace, + conferencedate, + dataInfo); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java index 31b898788d..4e2d601387 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java @@ -1,67 +1,74 @@ + package eu.dnetlib.dhp.schema.oaf; -import com.fasterxml.jackson.annotation.JsonIgnore; import java.io.Serializable; + import org.apache.commons.lang3.StringUtils; +import com.fasterxml.jackson.annotation.JsonIgnore; + public class KeyValue implements Serializable { - private String key; + private String key; - private String value; + private String value; - private DataInfo dataInfo; + private DataInfo dataInfo; - public String getKey() { - return key; - } + public String getKey() { + return key; + } - public void setKey(String key) { - this.key = key; - } + public void setKey(String key) { + this.key = key; + } - public String getValue() { - return value; - } + public String getValue() { + return value; + } - public void 
setValue(String value) { - this.value = value; - } + public void setValue(String value) { + this.value = value; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - public String toComparableString() { - return isBlank() - ? "" - : String.format( - "%s::%s", - key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : ""); - } + public String toComparableString() { + return isBlank() + ? "" + : String + .format( + "%s::%s", + key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : ""); + } - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(key) && StringUtils.isBlank(value); - } + @JsonIgnore + public boolean isBlank() { + return StringUtils.isBlank(key) && StringUtils.isBlank(value); + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - KeyValue other = (KeyValue) obj; + KeyValue other = (KeyValue) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java index 5798adae97..88d74afbfd 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java +++ 
b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,26 +6,28 @@ import java.util.Objects; public class OAIProvenance implements Serializable { - private OriginDescription originDescription; + private OriginDescription originDescription; - public OriginDescription getOriginDescription() { - return originDescription; - } + public OriginDescription getOriginDescription() { + return originDescription; + } - public void setOriginDescription(OriginDescription originDescription) { - this.originDescription = originDescription; - } + public void setOriginDescription(OriginDescription originDescription) { + this.originDescription = originDescription; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - OAIProvenance that = (OAIProvenance) o; - return Objects.equals(originDescription, that.originDescription); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + OAIProvenance that = (OAIProvenance) o; + return Objects.equals(originDescription, that.originDescription); + } - @Override - public int hashCode() { - return Objects.hash(originDescription); - } + @Override + public int hashCode() { + return Objects.hash(originDescription); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java index d6561f5cb5..4bfc05039d 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -6,60 +7,64 @@ import java.util.Objects; public abstract class Oaf implements Serializable { - protected List collectedfrom; + protected List collectedfrom; - 
private DataInfo dataInfo; + private DataInfo dataInfo; - private Long lastupdatetimestamp; + private Long lastupdatetimestamp; - public List getCollectedfrom() { - return collectedfrom; - } + public List getCollectedfrom() { + return collectedfrom; + } - public void setCollectedfrom(List collectedfrom) { - this.collectedfrom = collectedfrom; - } + public void setCollectedfrom(List collectedfrom) { + this.collectedfrom = collectedfrom; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - public Long getLastupdatetimestamp() { - return lastupdatetimestamp; - } + public Long getLastupdatetimestamp() { + return lastupdatetimestamp; + } - public void setLastupdatetimestamp(Long lastupdatetimestamp) { - this.lastupdatetimestamp = lastupdatetimestamp; - } + public void setLastupdatetimestamp(Long lastupdatetimestamp) { + this.lastupdatetimestamp = lastupdatetimestamp; + } - public void mergeOAFDataInfo(Oaf e) { - if (e.getDataInfo() != null && compareTrust(this, e) < 0) dataInfo = e.getDataInfo(); - } + public void mergeOAFDataInfo(Oaf e) { + if (e.getDataInfo() != null && compareTrust(this, e) < 0) + dataInfo = e.getDataInfo(); + } - protected String extractTrust(Oaf e) { - if (e == null || e.getDataInfo() == null || e.getDataInfo().getTrust() == null) return "0.0"; - return e.getDataInfo().getTrust(); - } + protected String extractTrust(Oaf e) { + if (e == null || e.getDataInfo() == null || e.getDataInfo().getTrust() == null) + return "0.0"; + return e.getDataInfo().getTrust(); + } - protected int compareTrust(Oaf a, Oaf b) { - return extractTrust(a).compareTo(extractTrust(b)); - } + protected int compareTrust(Oaf a, Oaf b) { + return extractTrust(a).compareTo(extractTrust(b)); + } - @Override - public boolean equals(Object o) { - if (this == o) 
return true; - if (o == null || getClass() != o.getClass()) return false; - Oaf oaf = (Oaf) o; - return Objects.equals(dataInfo, oaf.dataInfo) - && Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Oaf oaf = (Oaf) o; + return Objects.equals(dataInfo, oaf.dataInfo) + && Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp); + } - @Override - public int hashCode() { - return Objects.hash(dataInfo, lastupdatetimestamp); - } + @Override + public int hashCode() { + return Objects.hash(dataInfo, lastupdatetimestamp); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java index fbc73bb0ad..09742748d7 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -6,118 +7,123 @@ import java.util.stream.Collectors; public abstract class OafEntity extends Oaf implements Serializable { - private String id; + private String id; - private List originalId; + private List originalId; - private List pid; + private List pid; - private String dateofcollection; + private String dateofcollection; - private String dateoftransformation; + private String dateoftransformation; - private List extraInfo; + private List extraInfo; - private OAIProvenance oaiprovenance; + private OAIProvenance oaiprovenance; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public List getOriginalId() { - return originalId; - } + public List getOriginalId() { + return originalId; + } - public void setOriginalId(List originalId) { - 
this.originalId = originalId; - } + public void setOriginalId(List originalId) { + this.originalId = originalId; + } - public List getPid() { - return pid; - } + public List getPid() { + return pid; + } - public void setPid(List pid) { - this.pid = pid; - } + public void setPid(List pid) { + this.pid = pid; + } - public String getDateofcollection() { - return dateofcollection; - } + public String getDateofcollection() { + return dateofcollection; + } - public void setDateofcollection(String dateofcollection) { - this.dateofcollection = dateofcollection; - } + public void setDateofcollection(String dateofcollection) { + this.dateofcollection = dateofcollection; + } - public String getDateoftransformation() { - return dateoftransformation; - } + public String getDateoftransformation() { + return dateoftransformation; + } - public void setDateoftransformation(String dateoftransformation) { - this.dateoftransformation = dateoftransformation; - } + public void setDateoftransformation(String dateoftransformation) { + this.dateoftransformation = dateoftransformation; + } - public List getExtraInfo() { - return extraInfo; - } + public List getExtraInfo() { + return extraInfo; + } - public void setExtraInfo(List extraInfo) { - this.extraInfo = extraInfo; - } + public void setExtraInfo(List extraInfo) { + this.extraInfo = extraInfo; + } - public OAIProvenance getOaiprovenance() { - return oaiprovenance; - } + public OAIProvenance getOaiprovenance() { + return oaiprovenance; + } - public void setOaiprovenance(OAIProvenance oaiprovenance) { - this.oaiprovenance = oaiprovenance; - } + public void setOaiprovenance(OAIProvenance oaiprovenance) { + this.oaiprovenance = oaiprovenance; + } - public void mergeFrom(OafEntity e) { + public void mergeFrom(OafEntity e) { - if (e == null) return; + if (e == null) + return; - originalId = mergeLists(originalId, e.getOriginalId()); + originalId = mergeLists(originalId, e.getOriginalId()); - collectedfrom = mergeLists(collectedfrom, 
e.getCollectedfrom()); + collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom()); - pid = mergeLists(pid, e.getPid()); + pid = mergeLists(pid, e.getPid()); - if (e.getDateofcollection() != null && compareTrust(this, e) < 0) - dateofcollection = e.getDateofcollection(); + if (e.getDateofcollection() != null && compareTrust(this, e) < 0) + dateofcollection = e.getDateofcollection(); - if (e.getDateoftransformation() != null && compareTrust(this, e) < 0) - dateoftransformation = e.getDateoftransformation(); + if (e.getDateoftransformation() != null && compareTrust(this, e) < 0) + dateoftransformation = e.getDateoftransformation(); - extraInfo = mergeLists(extraInfo, e.getExtraInfo()); + extraInfo = mergeLists(extraInfo, e.getExtraInfo()); - if (e.getOaiprovenance() != null && compareTrust(this, e) < 0) - oaiprovenance = e.getOaiprovenance(); - } + if (e.getOaiprovenance() != null && compareTrust(this, e) < 0) + oaiprovenance = e.getOaiprovenance(); + } - protected List mergeLists(final List... lists) { + protected List mergeLists(final List... 
lists) { - return Arrays.stream(lists) - .filter(Objects::nonNull) - .flatMap(List::stream) - .distinct() - .collect(Collectors.toList()); - } + return Arrays + .stream(lists) + .filter(Objects::nonNull) + .flatMap(List::stream) + .distinct() + .collect(Collectors.toList()); + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - if (!super.equals(o)) return false; - OafEntity oafEntity = (OafEntity) o; - return Objects.equals(id, oafEntity.id); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + if (!super.equals(o)) + return false; + OafEntity oafEntity = (OafEntity) o; + return Objects.equals(id, oafEntity.id); + } - @Override - public int hashCode() { - return Objects.hash(super.hashCode(), id); - } + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), id); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java index 4339ff5b4d..a5f9bce308 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,221 +6,209 @@ import java.util.List; public class Organization extends OafEntity implements Serializable { - private Field legalshortname; + private Field legalshortname; - private Field legalname; + private Field legalname; - private List> alternativeNames; + private List> alternativeNames; - private Field websiteurl; + private Field websiteurl; - private Field logourl; + private Field logourl; - private Field eclegalbody; + private Field eclegalbody; - private Field eclegalperson; + private Field eclegalperson; - private Field ecnonprofit; + private Field ecnonprofit; - 
private Field ecresearchorganization; + private Field ecresearchorganization; - private Field echighereducation; + private Field echighereducation; - private Field ecinternationalorganizationeurinterests; + private Field ecinternationalorganizationeurinterests; - private Field ecinternationalorganization; + private Field ecinternationalorganization; - private Field ecenterprise; + private Field ecenterprise; - private Field ecsmevalidated; + private Field ecsmevalidated; - private Field ecnutscode; + private Field ecnutscode; - private Qualifier country; + private Qualifier country; - public Field getLegalshortname() { - return legalshortname; - } + public Field getLegalshortname() { + return legalshortname; + } - public void setLegalshortname(Field legalshortname) { - this.legalshortname = legalshortname; - } + public void setLegalshortname(Field legalshortname) { + this.legalshortname = legalshortname; + } - public Field getLegalname() { - return legalname; - } + public Field getLegalname() { + return legalname; + } - public void setLegalname(Field legalname) { - this.legalname = legalname; - } + public void setLegalname(Field legalname) { + this.legalname = legalname; + } - public List> getAlternativeNames() { - return alternativeNames; - } + public List> getAlternativeNames() { + return alternativeNames; + } - public void setAlternativeNames(List> alternativeNames) { - this.alternativeNames = alternativeNames; - } + public void setAlternativeNames(List> alternativeNames) { + this.alternativeNames = alternativeNames; + } - public Field getWebsiteurl() { - return websiteurl; - } + public Field getWebsiteurl() { + return websiteurl; + } - public void setWebsiteurl(Field websiteurl) { - this.websiteurl = websiteurl; - } + public void setWebsiteurl(Field websiteurl) { + this.websiteurl = websiteurl; + } - public Field getLogourl() { - return logourl; - } + public Field getLogourl() { + return logourl; + } - public void setLogourl(Field logourl) { - this.logourl = 
logourl; - } + public void setLogourl(Field logourl) { + this.logourl = logourl; + } - public Field getEclegalbody() { - return eclegalbody; - } + public Field getEclegalbody() { + return eclegalbody; + } - public void setEclegalbody(Field eclegalbody) { - this.eclegalbody = eclegalbody; - } + public void setEclegalbody(Field eclegalbody) { + this.eclegalbody = eclegalbody; + } - public Field getEclegalperson() { - return eclegalperson; - } + public Field getEclegalperson() { + return eclegalperson; + } - public void setEclegalperson(Field eclegalperson) { - this.eclegalperson = eclegalperson; - } + public void setEclegalperson(Field eclegalperson) { + this.eclegalperson = eclegalperson; + } - public Field getEcnonprofit() { - return ecnonprofit; - } + public Field getEcnonprofit() { + return ecnonprofit; + } - public void setEcnonprofit(Field ecnonprofit) { - this.ecnonprofit = ecnonprofit; - } + public void setEcnonprofit(Field ecnonprofit) { + this.ecnonprofit = ecnonprofit; + } - public Field getEcresearchorganization() { - return ecresearchorganization; - } + public Field getEcresearchorganization() { + return ecresearchorganization; + } - public void setEcresearchorganization(Field ecresearchorganization) { - this.ecresearchorganization = ecresearchorganization; - } + public void setEcresearchorganization(Field ecresearchorganization) { + this.ecresearchorganization = ecresearchorganization; + } - public Field getEchighereducation() { - return echighereducation; - } + public Field getEchighereducation() { + return echighereducation; + } - public void setEchighereducation(Field echighereducation) { - this.echighereducation = echighereducation; - } + public void setEchighereducation(Field echighereducation) { + this.echighereducation = echighereducation; + } - public Field getEcinternationalorganizationeurinterests() { - return ecinternationalorganizationeurinterests; - } + public Field getEcinternationalorganizationeurinterests() { + return 
ecinternationalorganizationeurinterests; + } - public void setEcinternationalorganizationeurinterests( - Field ecinternationalorganizationeurinterests) { - this.ecinternationalorganizationeurinterests = ecinternationalorganizationeurinterests; - } + public void setEcinternationalorganizationeurinterests( + Field ecinternationalorganizationeurinterests) { + this.ecinternationalorganizationeurinterests = ecinternationalorganizationeurinterests; + } - public Field getEcinternationalorganization() { - return ecinternationalorganization; - } + public Field getEcinternationalorganization() { + return ecinternationalorganization; + } - public void setEcinternationalorganization(Field ecinternationalorganization) { - this.ecinternationalorganization = ecinternationalorganization; - } + public void setEcinternationalorganization(Field ecinternationalorganization) { + this.ecinternationalorganization = ecinternationalorganization; + } - public Field getEcenterprise() { - return ecenterprise; - } + public Field getEcenterprise() { + return ecenterprise; + } - public void setEcenterprise(Field ecenterprise) { - this.ecenterprise = ecenterprise; - } + public void setEcenterprise(Field ecenterprise) { + this.ecenterprise = ecenterprise; + } - public Field getEcsmevalidated() { - return ecsmevalidated; - } + public Field getEcsmevalidated() { + return ecsmevalidated; + } - public void setEcsmevalidated(Field ecsmevalidated) { - this.ecsmevalidated = ecsmevalidated; - } + public void setEcsmevalidated(Field ecsmevalidated) { + this.ecsmevalidated = ecsmevalidated; + } - public Field getEcnutscode() { - return ecnutscode; - } + public Field getEcnutscode() { + return ecnutscode; + } - public void setEcnutscode(Field ecnutscode) { - this.ecnutscode = ecnutscode; - } + public void setEcnutscode(Field ecnutscode) { + this.ecnutscode = ecnutscode; + } - public Qualifier getCountry() { - return country; - } + public Qualifier getCountry() { + return country; + } - public void 
setCountry(Qualifier country) { - this.country = country; - } + public void setCountry(Qualifier country) { + this.country = country; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!Organization.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Organization.class.isAssignableFrom(e.getClass())) { + return; + } - final Organization o = (Organization) e; - legalshortname = - o.getLegalshortname() != null && compareTrust(this, e) < 0 - ? o.getLegalshortname() - : legalshortname; - legalname = - o.getLegalname() != null && compareTrust(this, e) < 0 ? o.getLegalname() : legalname; - alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames); - websiteurl = - o.getWebsiteurl() != null && compareTrust(this, e) < 0 ? o.getWebsiteurl() : websiteurl; - logourl = o.getLogourl() != null && compareTrust(this, e) < 0 ? o.getLogourl() : logourl; - eclegalbody = - o.getEclegalbody() != null && compareTrust(this, e) < 0 ? o.getEclegalbody() : eclegalbody; - eclegalperson = - o.getEclegalperson() != null && compareTrust(this, e) < 0 - ? o.getEclegalperson() - : eclegalperson; - ecnonprofit = - o.getEcnonprofit() != null && compareTrust(this, e) < 0 ? o.getEcnonprofit() : ecnonprofit; - ecresearchorganization = - o.getEcresearchorganization() != null && compareTrust(this, e) < 0 - ? o.getEcresearchorganization() - : ecresearchorganization; - echighereducation = - o.getEchighereducation() != null && compareTrust(this, e) < 0 - ? o.getEchighereducation() - : echighereducation; - ecinternationalorganizationeurinterests = - o.getEcinternationalorganizationeurinterests() != null && compareTrust(this, e) < 0 - ? o.getEcinternationalorganizationeurinterests() - : ecinternationalorganizationeurinterests; - ecinternationalorganization = - o.getEcinternationalorganization() != null && compareTrust(this, e) < 0 - ? 
o.getEcinternationalorganization() - : ecinternationalorganization; - ecenterprise = - o.getEcenterprise() != null && compareTrust(this, e) < 0 - ? o.getEcenterprise() - : ecenterprise; - ecsmevalidated = - o.getEcsmevalidated() != null && compareTrust(this, e) < 0 - ? o.getEcsmevalidated() - : ecsmevalidated; - ecnutscode = - o.getEcnutscode() != null && compareTrust(this, e) < 0 ? o.getEcnutscode() : ecnutscode; - country = o.getCountry() != null && compareTrust(this, e) < 0 ? o.getCountry() : country; - mergeOAFDataInfo(o); - } + final Organization o = (Organization) e; + legalshortname = o.getLegalshortname() != null && compareTrust(this, e) < 0 + ? o.getLegalshortname() + : legalshortname; + legalname = o.getLegalname() != null && compareTrust(this, e) < 0 ? o.getLegalname() : legalname; + alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames); + websiteurl = o.getWebsiteurl() != null && compareTrust(this, e) < 0 ? o.getWebsiteurl() : websiteurl; + logourl = o.getLogourl() != null && compareTrust(this, e) < 0 ? o.getLogourl() : logourl; + eclegalbody = o.getEclegalbody() != null && compareTrust(this, e) < 0 ? o.getEclegalbody() : eclegalbody; + eclegalperson = o.getEclegalperson() != null && compareTrust(this, e) < 0 + ? o.getEclegalperson() + : eclegalperson; + ecnonprofit = o.getEcnonprofit() != null && compareTrust(this, e) < 0 ? o.getEcnonprofit() : ecnonprofit; + ecresearchorganization = o.getEcresearchorganization() != null && compareTrust(this, e) < 0 + ? o.getEcresearchorganization() + : ecresearchorganization; + echighereducation = o.getEchighereducation() != null && compareTrust(this, e) < 0 + ? o.getEchighereducation() + : echighereducation; + ecinternationalorganizationeurinterests = o.getEcinternationalorganizationeurinterests() != null + && compareTrust(this, e) < 0 + ? 
o.getEcinternationalorganizationeurinterests() + : ecinternationalorganizationeurinterests; + ecinternationalorganization = o.getEcinternationalorganization() != null && compareTrust(this, e) < 0 + ? o.getEcinternationalorganization() + : ecinternationalorganization; + ecenterprise = o.getEcenterprise() != null && compareTrust(this, e) < 0 + ? o.getEcenterprise() + : ecenterprise; + ecsmevalidated = o.getEcsmevalidated() != null && compareTrust(this, e) < 0 + ? o.getEcsmevalidated() + : ecsmevalidated; + ecnutscode = o.getEcnutscode() != null && compareTrust(this, e) < 0 ? o.getEcnutscode() : ecnutscode; + country = o.getCountry() != null && compareTrust(this, e) < 0 ? o.getCountry() : country; + mergeOAFDataInfo(o); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java index 5bdabb5589..a275fc1a92 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,81 +6,83 @@ import java.util.Objects; public class OriginDescription implements Serializable { - private String harvestDate; + private String harvestDate; - private Boolean altered = true; + private Boolean altered = true; - private String baseURL; + private String baseURL; - private String identifier; + private String identifier; - private String datestamp; + private String datestamp; - private String metadataNamespace; + private String metadataNamespace; - public String getHarvestDate() { - return harvestDate; - } + public String getHarvestDate() { + return harvestDate; + } - public void setHarvestDate(String harvestDate) { - this.harvestDate = harvestDate; - } + public void setHarvestDate(String harvestDate) { + this.harvestDate = harvestDate; + } - public Boolean getAltered() { - return altered; - } 
+ public Boolean getAltered() { + return altered; + } - public void setAltered(Boolean altered) { - this.altered = altered; - } + public void setAltered(Boolean altered) { + this.altered = altered; + } - public String getBaseURL() { - return baseURL; - } + public String getBaseURL() { + return baseURL; + } - public void setBaseURL(String baseURL) { - this.baseURL = baseURL; - } + public void setBaseURL(String baseURL) { + this.baseURL = baseURL; + } - public String getIdentifier() { - return identifier; - } + public String getIdentifier() { + return identifier; + } - public void setIdentifier(String identifier) { - this.identifier = identifier; - } + public void setIdentifier(String identifier) { + this.identifier = identifier; + } - public String getDatestamp() { - return datestamp; - } + public String getDatestamp() { + return datestamp; + } - public void setDatestamp(String datestamp) { - this.datestamp = datestamp; - } + public void setDatestamp(String datestamp) { + this.datestamp = datestamp; + } - public String getMetadataNamespace() { - return metadataNamespace; - } + public String getMetadataNamespace() { + return metadataNamespace; + } - public void setMetadataNamespace(String metadataNamespace) { - this.metadataNamespace = metadataNamespace; - } + public void setMetadataNamespace(String metadataNamespace) { + this.metadataNamespace = metadataNamespace; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - OriginDescription that = (OriginDescription) o; - return Objects.equals(harvestDate, that.harvestDate) - && Objects.equals(altered, that.altered) - && Objects.equals(baseURL, that.baseURL) - && Objects.equals(identifier, that.identifier) - && Objects.equals(datestamp, that.datestamp) - && Objects.equals(metadataNamespace, that.metadataNamespace); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != 
o.getClass()) + return false; + OriginDescription that = (OriginDescription) o; + return Objects.equals(harvestDate, that.harvestDate) + && Objects.equals(altered, that.altered) + && Objects.equals(baseURL, that.baseURL) + && Objects.equals(identifier, that.identifier) + && Objects.equals(datestamp, that.datestamp) + && Objects.equals(metadataNamespace, that.metadataNamespace); + } - @Override - public int hashCode() { - return Objects.hash(harvestDate, altered, baseURL, identifier, datestamp, metadataNamespace); - } + @Override + public int hashCode() { + return Objects.hash(harvestDate, altered, baseURL, identifier, datestamp, metadataNamespace); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java index 6cd803ec55..b04934c235 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java @@ -1,58 +1,60 @@ + package eu.dnetlib.dhp.schema.oaf; -import eu.dnetlib.dhp.schema.common.ModelConstants; import java.io.Serializable; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; + public class OtherResearchProduct extends Result implements Serializable { - private List> contactperson; + private List> contactperson; - private List> contactgroup; + private List> contactgroup; - private List> tool; + private List> tool; - public OtherResearchProduct() { - setResulttype(ModelConstants.ORP_DEFAULT_RESULTTYPE); - } + public OtherResearchProduct() { + setResulttype(ModelConstants.ORP_DEFAULT_RESULTTYPE); + } - public List> getContactperson() { - return contactperson; - } + public List> getContactperson() { + return contactperson; + } - public void setContactperson(List> contactperson) { - this.contactperson = contactperson; - } + public void setContactperson(List> contactperson) { + this.contactperson = contactperson; + 
} - public List> getContactgroup() { - return contactgroup; - } + public List> getContactgroup() { + return contactgroup; + } - public void setContactgroup(List> contactgroup) { - this.contactgroup = contactgroup; - } + public void setContactgroup(List> contactgroup) { + this.contactgroup = contactgroup; + } - public List> getTool() { - return tool; - } + public List> getTool() { + return tool; + } - public void setTool(List> tool) { - this.tool = tool; - } + public void setTool(List> tool) { + this.tool = tool; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!OtherResearchProduct.class.isAssignableFrom(e.getClass())) { - return; - } + if (!OtherResearchProduct.class.isAssignableFrom(e.getClass())) { + return; + } - OtherResearchProduct o = (OtherResearchProduct) e; + OtherResearchProduct o = (OtherResearchProduct) e; - contactperson = mergeLists(contactperson, o.getContactperson()); - contactgroup = mergeLists(contactgroup, o.getContactgroup()); - tool = mergeLists(tool, o.getTool()); - mergeOAFDataInfo(e); - } + contactperson = mergeLists(contactperson, o.getContactperson()); + contactgroup = mergeLists(contactgroup, o.getContactgroup()); + tool = mergeLists(tool, o.getTool()); + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java index 2187a88280..924c08cc9c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,335 +6,320 @@ import java.util.List; public class Project extends OafEntity implements Serializable { - private Field websiteurl; + private Field websiteurl; - private Field code; + private Field code; - private Field acronym; + private Field acronym; 
- private Field title; + private Field title; - private Field startdate; + private Field startdate; - private Field enddate; + private Field enddate; - private Field callidentifier; + private Field callidentifier; - private Field keywords; + private Field keywords; - private Field duration; + private Field duration; - private Field ecsc39; + private Field ecsc39; - private Field oamandatepublications; + private Field oamandatepublications; - private Field ecarticle29_3; + private Field ecarticle29_3; - private List subjects; + private List subjects; - private List> fundingtree; + private List> fundingtree; - private Qualifier contracttype; + private Qualifier contracttype; - private Field optional1; + private Field optional1; - private Field optional2; + private Field optional2; - private Field jsonextrainfo; + private Field jsonextrainfo; - private Field contactfullname; + private Field contactfullname; - private Field contactfax; + private Field contactfax; - private Field contactphone; + private Field contactphone; - private Field contactemail; + private Field contactemail; - private Field summary; + private Field summary; - private Field currency; + private Field currency; - private Float totalcost; + private Float totalcost; - private Float fundedamount; + private Float fundedamount; - public Field getWebsiteurl() { - return websiteurl; - } + public Field getWebsiteurl() { + return websiteurl; + } - public void setWebsiteurl(Field websiteurl) { - this.websiteurl = websiteurl; - } + public void setWebsiteurl(Field websiteurl) { + this.websiteurl = websiteurl; + } - public Field getCode() { - return code; - } + public Field getCode() { + return code; + } - public void setCode(Field code) { - this.code = code; - } + public void setCode(Field code) { + this.code = code; + } - public Field getAcronym() { - return acronym; - } + public Field getAcronym() { + return acronym; + } - public void setAcronym(Field acronym) { - this.acronym = acronym; - } + public void 
setAcronym(Field acronym) { + this.acronym = acronym; + } - public Field getTitle() { - return title; - } + public Field getTitle() { + return title; + } - public void setTitle(Field title) { - this.title = title; - } + public void setTitle(Field title) { + this.title = title; + } - public Field getStartdate() { - return startdate; - } + public Field getStartdate() { + return startdate; + } - public void setStartdate(Field startdate) { - this.startdate = startdate; - } + public void setStartdate(Field startdate) { + this.startdate = startdate; + } - public Field getEnddate() { - return enddate; - } + public Field getEnddate() { + return enddate; + } - public void setEnddate(Field enddate) { - this.enddate = enddate; - } + public void setEnddate(Field enddate) { + this.enddate = enddate; + } - public Field getCallidentifier() { - return callidentifier; - } + public Field getCallidentifier() { + return callidentifier; + } - public void setCallidentifier(Field callidentifier) { - this.callidentifier = callidentifier; - } + public void setCallidentifier(Field callidentifier) { + this.callidentifier = callidentifier; + } - public Field getKeywords() { - return keywords; - } + public Field getKeywords() { + return keywords; + } - public void setKeywords(Field keywords) { - this.keywords = keywords; - } + public void setKeywords(Field keywords) { + this.keywords = keywords; + } - public Field getDuration() { - return duration; - } + public Field getDuration() { + return duration; + } - public void setDuration(Field duration) { - this.duration = duration; - } + public void setDuration(Field duration) { + this.duration = duration; + } - public Field getEcsc39() { - return ecsc39; - } + public Field getEcsc39() { + return ecsc39; + } - public void setEcsc39(Field ecsc39) { - this.ecsc39 = ecsc39; - } + public void setEcsc39(Field ecsc39) { + this.ecsc39 = ecsc39; + } - public Field getOamandatepublications() { - return oamandatepublications; - } + public Field 
getOamandatepublications() { + return oamandatepublications; + } - public void setOamandatepublications(Field oamandatepublications) { - this.oamandatepublications = oamandatepublications; - } + public void setOamandatepublications(Field oamandatepublications) { + this.oamandatepublications = oamandatepublications; + } - public Field getEcarticle29_3() { - return ecarticle29_3; - } + public Field getEcarticle29_3() { + return ecarticle29_3; + } - public void setEcarticle29_3(Field ecarticle29_3) { - this.ecarticle29_3 = ecarticle29_3; - } + public void setEcarticle29_3(Field ecarticle29_3) { + this.ecarticle29_3 = ecarticle29_3; + } - public List getSubjects() { - return subjects; - } + public List getSubjects() { + return subjects; + } - public void setSubjects(List subjects) { - this.subjects = subjects; - } + public void setSubjects(List subjects) { + this.subjects = subjects; + } - public List> getFundingtree() { - return fundingtree; - } + public List> getFundingtree() { + return fundingtree; + } - public void setFundingtree(List> fundingtree) { - this.fundingtree = fundingtree; - } + public void setFundingtree(List> fundingtree) { + this.fundingtree = fundingtree; + } - public Qualifier getContracttype() { - return contracttype; - } + public Qualifier getContracttype() { + return contracttype; + } - public void setContracttype(Qualifier contracttype) { - this.contracttype = contracttype; - } + public void setContracttype(Qualifier contracttype) { + this.contracttype = contracttype; + } - public Field getOptional1() { - return optional1; - } + public Field getOptional1() { + return optional1; + } - public void setOptional1(Field optional1) { - this.optional1 = optional1; - } - - public Field getOptional2() { - return optional2; - } - - public void setOptional2(Field optional2) { - this.optional2 = optional2; - } - - public Field getJsonextrainfo() { - return jsonextrainfo; - } - - public void setJsonextrainfo(Field jsonextrainfo) { - this.jsonextrainfo = 
jsonextrainfo; - } - - public Field getContactfullname() { - return contactfullname; - } - - public void setContactfullname(Field contactfullname) { - this.contactfullname = contactfullname; - } - - public Field getContactfax() { - return contactfax; - } - - public void setContactfax(Field contactfax) { - this.contactfax = contactfax; - } - - public Field getContactphone() { - return contactphone; - } - - public void setContactphone(Field contactphone) { - this.contactphone = contactphone; - } - - public Field getContactemail() { - return contactemail; - } - - public void setContactemail(Field contactemail) { - this.contactemail = contactemail; - } - - public Field getSummary() { - return summary; - } - - public void setSummary(Field summary) { - this.summary = summary; - } - - public Field getCurrency() { - return currency; - } - - public void setCurrency(Field currency) { - this.currency = currency; - } - - public Float getTotalcost() { - return totalcost; - } - - public void setTotalcost(Float totalcost) { - this.totalcost = totalcost; - } - - public Float getFundedamount() { - return fundedamount; - } - - public void setFundedamount(Float fundedamount) { - this.fundedamount = fundedamount; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Project.class.isAssignableFrom(e.getClass())) { - return; - } - - Project p = (Project) e; - - websiteurl = - p.getWebsiteurl() != null && compareTrust(this, e) < 0 ? p.getWebsiteurl() : websiteurl; - code = p.getCode() != null && compareTrust(this, e) < 0 ? p.getCode() : code; - acronym = p.getAcronym() != null && compareTrust(this, e) < 0 ? p.getAcronym() : acronym; - title = p.getTitle() != null && compareTrust(this, e) < 0 ? p.getTitle() : title; - startdate = - p.getStartdate() != null && compareTrust(this, e) < 0 ? p.getStartdate() : startdate; - enddate = p.getEnddate() != null && compareTrust(this, e) < 0 ? 
p.getEnddate() : enddate; - callidentifier = - p.getCallidentifier() != null && compareTrust(this, e) < 0 - ? p.getCallidentifier() - : callidentifier; - keywords = p.getKeywords() != null && compareTrust(this, e) < 0 ? p.getKeywords() : keywords; - duration = p.getDuration() != null && compareTrust(this, e) < 0 ? p.getDuration() : duration; - ecsc39 = p.getEcsc39() != null && compareTrust(this, e) < 0 ? p.getEcsc39() : ecsc39; - oamandatepublications = - p.getOamandatepublications() != null && compareTrust(this, e) < 0 - ? p.getOamandatepublications() - : oamandatepublications; - ecarticle29_3 = - p.getEcarticle29_3() != null && compareTrust(this, e) < 0 - ? p.getEcarticle29_3() - : ecarticle29_3; - subjects = mergeLists(subjects, p.getSubjects()); - fundingtree = mergeLists(fundingtree, p.getFundingtree()); - contracttype = - p.getContracttype() != null && compareTrust(this, e) < 0 - ? p.getContracttype() - : contracttype; - optional1 = - p.getOptional1() != null && compareTrust(this, e) < 0 ? p.getOptional1() : optional1; - optional2 = - p.getOptional2() != null && compareTrust(this, e) < 0 ? p.getOptional2() : optional2; - jsonextrainfo = - p.getJsonextrainfo() != null && compareTrust(this, e) < 0 - ? p.getJsonextrainfo() - : jsonextrainfo; - contactfullname = - p.getContactfullname() != null && compareTrust(this, e) < 0 - ? p.getContactfullname() - : contactfullname; - contactfax = - p.getContactfax() != null && compareTrust(this, e) < 0 ? p.getContactfax() : contactfax; - contactphone = - p.getContactphone() != null && compareTrust(this, e) < 0 - ? p.getContactphone() - : contactphone; - contactemail = - p.getContactemail() != null && compareTrust(this, e) < 0 - ? p.getContactemail() - : contactemail; - summary = p.getSummary() != null && compareTrust(this, e) < 0 ? p.getSummary() : summary; - currency = p.getCurrency() != null && compareTrust(this, e) < 0 ? p.getCurrency() : currency; - totalcost = - p.getTotalcost() != null && compareTrust(this, e) < 0 ? 
p.getTotalcost() : totalcost; - fundedamount = - p.getFundedamount() != null && compareTrust(this, e) < 0 - ? p.getFundedamount() - : fundedamount; - mergeOAFDataInfo(e); - } + public void setOptional1(Field optional1) { + this.optional1 = optional1; + } + + public Field getOptional2() { + return optional2; + } + + public void setOptional2(Field optional2) { + this.optional2 = optional2; + } + + public Field getJsonextrainfo() { + return jsonextrainfo; + } + + public void setJsonextrainfo(Field jsonextrainfo) { + this.jsonextrainfo = jsonextrainfo; + } + + public Field getContactfullname() { + return contactfullname; + } + + public void setContactfullname(Field contactfullname) { + this.contactfullname = contactfullname; + } + + public Field getContactfax() { + return contactfax; + } + + public void setContactfax(Field contactfax) { + this.contactfax = contactfax; + } + + public Field getContactphone() { + return contactphone; + } + + public void setContactphone(Field contactphone) { + this.contactphone = contactphone; + } + + public Field getContactemail() { + return contactemail; + } + + public void setContactemail(Field contactemail) { + this.contactemail = contactemail; + } + + public Field getSummary() { + return summary; + } + + public void setSummary(Field summary) { + this.summary = summary; + } + + public Field getCurrency() { + return currency; + } + + public void setCurrency(Field currency) { + this.currency = currency; + } + + public Float getTotalcost() { + return totalcost; + } + + public void setTotalcost(Float totalcost) { + this.totalcost = totalcost; + } + + public Float getFundedamount() { + return fundedamount; + } + + public void setFundedamount(Float fundedamount) { + this.fundedamount = fundedamount; + } + + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + + if (!Project.class.isAssignableFrom(e.getClass())) { + return; + } + + Project p = (Project) e; + + websiteurl = p.getWebsiteurl() != null && compareTrust(this, e) 
< 0 ? p.getWebsiteurl() : websiteurl; + code = p.getCode() != null && compareTrust(this, e) < 0 ? p.getCode() : code; + acronym = p.getAcronym() != null && compareTrust(this, e) < 0 ? p.getAcronym() : acronym; + title = p.getTitle() != null && compareTrust(this, e) < 0 ? p.getTitle() : title; + startdate = p.getStartdate() != null && compareTrust(this, e) < 0 ? p.getStartdate() : startdate; + enddate = p.getEnddate() != null && compareTrust(this, e) < 0 ? p.getEnddate() : enddate; + callidentifier = p.getCallidentifier() != null && compareTrust(this, e) < 0 + ? p.getCallidentifier() + : callidentifier; + keywords = p.getKeywords() != null && compareTrust(this, e) < 0 ? p.getKeywords() : keywords; + duration = p.getDuration() != null && compareTrust(this, e) < 0 ? p.getDuration() : duration; + ecsc39 = p.getEcsc39() != null && compareTrust(this, e) < 0 ? p.getEcsc39() : ecsc39; + oamandatepublications = p.getOamandatepublications() != null && compareTrust(this, e) < 0 + ? p.getOamandatepublications() + : oamandatepublications; + ecarticle29_3 = p.getEcarticle29_3() != null && compareTrust(this, e) < 0 + ? p.getEcarticle29_3() + : ecarticle29_3; + subjects = mergeLists(subjects, p.getSubjects()); + fundingtree = mergeLists(fundingtree, p.getFundingtree()); + contracttype = p.getContracttype() != null && compareTrust(this, e) < 0 + ? p.getContracttype() + : contracttype; + optional1 = p.getOptional1() != null && compareTrust(this, e) < 0 ? p.getOptional1() : optional1; + optional2 = p.getOptional2() != null && compareTrust(this, e) < 0 ? p.getOptional2() : optional2; + jsonextrainfo = p.getJsonextrainfo() != null && compareTrust(this, e) < 0 + ? p.getJsonextrainfo() + : jsonextrainfo; + contactfullname = p.getContactfullname() != null && compareTrust(this, e) < 0 + ? p.getContactfullname() + : contactfullname; + contactfax = p.getContactfax() != null && compareTrust(this, e) < 0 ? 
p.getContactfax() : contactfax; + contactphone = p.getContactphone() != null && compareTrust(this, e) < 0 + ? p.getContactphone() + : contactphone; + contactemail = p.getContactemail() != null && compareTrust(this, e) < 0 + ? p.getContactemail() + : contactemail; + summary = p.getSummary() != null && compareTrust(this, e) < 0 ? p.getSummary() : summary; + currency = p.getCurrency() != null && compareTrust(this, e) < 0 ? p.getCurrency() : currency; + totalcost = p.getTotalcost() != null && compareTrust(this, e) < 0 ? p.getTotalcost() : totalcost; + fundedamount = p.getFundedamount() != null && compareTrust(this, e) < 0 + ? p.getFundedamount() + : fundedamount; + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java index 9227df6eea..3058c262bc 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java @@ -1,36 +1,39 @@ + package eu.dnetlib.dhp.schema.oaf; -import eu.dnetlib.dhp.schema.common.ModelConstants; import java.io.Serializable; +import eu.dnetlib.dhp.schema.common.ModelConstants; + public class Publication extends Result implements Serializable { - // publication specific - private Journal journal; + // publication specific + private Journal journal; - public Publication() { - setResulttype(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE); - } + public Publication() { + setResulttype(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE); + } - public Journal getJournal() { - return journal; - } + public Journal getJournal() { + return journal; + } - public void setJournal(Journal journal) { - this.journal = journal; - } + public void setJournal(Journal journal) { + this.journal = journal; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if 
(!Publication.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Publication.class.isAssignableFrom(e.getClass())) { + return; + } - Publication p = (Publication) e; + Publication p = (Publication) e; - if (p.getJournal() != null && compareTrust(this, e) < 0) journal = p.getJournal(); - mergeOAFDataInfo(e); - } + if (p.getJournal() != null && compareTrust(this, e) < 0) + journal = p.getJournal(); + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java index 003d4a7a45..87ecb55f18 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java @@ -1,80 +1,87 @@ + package eu.dnetlib.dhp.schema.oaf; -import com.fasterxml.jackson.annotation.JsonIgnore; import java.io.Serializable; + import org.apache.commons.lang3.StringUtils; +import com.fasterxml.jackson.annotation.JsonIgnore; + public class Qualifier implements Serializable { - private String classid; - private String classname; - private String schemeid; - private String schemename; + private String classid; + private String classname; + private String schemeid; + private String schemename; - public String getClassid() { - return classid; - } + public String getClassid() { + return classid; + } - public void setClassid(String classid) { - this.classid = classid; - } + public void setClassid(String classid) { + this.classid = classid; + } - public String getClassname() { - return classname; - } + public String getClassname() { + return classname; + } - public void setClassname(String classname) { - this.classname = classname; - } + public void setClassname(String classname) { + this.classname = classname; + } - public String getSchemeid() { - return schemeid; - } + public String getSchemeid() { + return schemeid; + } - public void setSchemeid(String schemeid) { - this.schemeid = schemeid; - } + 
public void setSchemeid(String schemeid) { + this.schemeid = schemeid; + } - public String getSchemename() { - return schemename; - } + public String getSchemename() { + return schemename; + } - public void setSchemename(String schemename) { - this.schemename = schemename; - } + public void setSchemename(String schemename) { + this.schemename = schemename; + } - public String toComparableString() { - return isBlank() - ? "" - : String.format( - "%s::%s::%s::%s", - classid != null ? classid : "", - classname != null ? classname : "", - schemeid != null ? schemeid : "", - schemename != null ? schemename : ""); - } + public String toComparableString() { + return isBlank() + ? "" + : String + .format( + "%s::%s::%s::%s", + classid != null ? classid : "", + classname != null ? classname : "", + schemeid != null ? schemeid : "", + schemename != null ? schemename : ""); + } - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(classid) - && StringUtils.isBlank(classname) - && StringUtils.isBlank(schemeid) - && StringUtils.isBlank(schemename); - } + @JsonIgnore + public boolean isBlank() { + return StringUtils.isBlank(classid) + && StringUtils.isBlank(classname) + && StringUtils.isBlank(schemeid) + && StringUtils.isBlank(schemename); + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - Qualifier other = (Qualifier) obj; + Qualifier other = (Qualifier) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java index 197adfb81d..2c282c29e0 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import static com.google.common.base.Preconditions.checkArgument; @@ -8,91 +9,96 @@ import java.util.stream.Stream; public class Relation extends Oaf { - private String relType; + private String relType; - private String subRelType; + private String subRelType; - private String relClass; + private String relClass; - private String source; + private String source; - private String target; + private String target; - public String getRelType() { - return relType; - } + public String getRelType() { + return relType; + } - public void setRelType(final String relType) { - this.relType = relType; - } + public void setRelType(final String relType) { + this.relType = relType; + } - public String getSubRelType() { - return subRelType; - } + public String getSubRelType() { + return subRelType; + } - public void setSubRelType(final String subRelType) { - this.subRelType = subRelType; - } + public void setSubRelType(final String subRelType) { + this.subRelType = subRelType; + } - public String getRelClass() { - return relClass; - } + public String getRelClass() { + return relClass; + } - public void setRelClass(final String relClass) { - this.relClass = relClass; - } + public void setRelClass(final String relClass) { + this.relClass = relClass; + } - public String getSource() { - return source; - } + public String getSource() { + return source; + } - public void setSource(final String source) { - this.source = source; - } + public void setSource(final String source) { + this.source = source; + } - public String getTarget() { - return target; - } + public String getTarget() { + return target; + } - public void 
setTarget(final String target) { - this.target = target; - } + public void setTarget(final String target) { + this.target = target; + } - public void mergeFrom(final Relation r) { + public void mergeFrom(final Relation r) { - checkArgument(Objects.equals(getSource(), r.getSource()), "source ids must be equal"); - checkArgument(Objects.equals(getTarget(), r.getTarget()), "target ids must be equal"); - checkArgument(Objects.equals(getRelType(), r.getRelType()), "relType(s) must be equal"); - checkArgument( - Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal"); - checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal"); + checkArgument(Objects.equals(getSource(), r.getSource()), "source ids must be equal"); + checkArgument(Objects.equals(getTarget(), r.getTarget()), "target ids must be equal"); + checkArgument(Objects.equals(getRelType(), r.getRelType()), "relType(s) must be equal"); + checkArgument( + Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal"); + checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal"); - setCollectedfrom( - Stream.concat( - Optional.ofNullable(getCollectedfrom()) - .map(Collection::stream) - .orElse(Stream.empty()), - Optional.ofNullable(r.getCollectedfrom()) - .map(Collection::stream) - .orElse(Stream.empty())) - .distinct() // relies on KeyValue.equals - .collect(Collectors.toList())); - } + setCollectedfrom( + Stream + .concat( + Optional + .ofNullable(getCollectedfrom()) + .map(Collection::stream) + .orElse(Stream.empty()), + Optional + .ofNullable(r.getCollectedfrom()) + .map(Collection::stream) + .orElse(Stream.empty())) + .distinct() // relies on KeyValue.equals + .collect(Collectors.toList())); + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Relation relation = (Relation) o; - return 
relType.equals(relation.relType) - && subRelType.equals(relation.subRelType) - && relClass.equals(relation.relClass) - && source.equals(relation.source) - && target.equals(relation.target); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Relation relation = (Relation) o; + return relType.equals(relation.relType) + && subRelType.equals(relation.subRelType) + && relClass.equals(relation.relClass) + && source.equals(relation.source) + && target.equals(relation.target); + } - @Override - public int hashCode() { - return Objects.hash(relType, subRelType, relClass, source, target, collectedfrom); - } + @Override + public int hashCode() { + return Objects.hash(relType, subRelType, relClass, source, target, collectedfrom); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index 5da50b9218..711b1ca681 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -6,286 +7,291 @@ import java.util.List; public class Result extends OafEntity implements Serializable { - private List author; + private List author; - // resulttype allows subclassing results into publications | datasets | software - private Qualifier resulttype; + // resulttype allows subclassing results into publications | datasets | software + private Qualifier resulttype; - // common fields - private Qualifier language; + // common fields + private Qualifier language; - private List country; + private List country; - private List subject; + private List subject; - private List title; + private List title; - private List relevantdate; + private List relevantdate; - private List> description; + private List> description; - private Field 
dateofacceptance; + private Field dateofacceptance; - private Field publisher; + private Field publisher; - private Field embargoenddate; + private Field embargoenddate; - private List> source; + private List> source; - private List> fulltext; // remove candidate + private List> fulltext; // remove candidate - private List> format; + private List> format; - private List> contributor; + private List> contributor; - private Qualifier resourcetype; + private Qualifier resourcetype; - private List> coverage; + private List> coverage; - private Qualifier bestaccessright; + private Qualifier bestaccessright; - private List context; + private List context; - private List externalReference; + private List externalReference; - private List instance; + private List instance; - public List getAuthor() { - return author; - } + public List getAuthor() { + return author; + } - public void setAuthor(List author) { - this.author = author; - } + public void setAuthor(List author) { + this.author = author; + } - public Qualifier getResulttype() { - return resulttype; - } + public Qualifier getResulttype() { + return resulttype; + } - public void setResulttype(Qualifier resulttype) { - this.resulttype = resulttype; - } + public void setResulttype(Qualifier resulttype) { + this.resulttype = resulttype; + } - public Qualifier getLanguage() { - return language; - } + public Qualifier getLanguage() { + return language; + } - public void setLanguage(Qualifier language) { - this.language = language; - } + public void setLanguage(Qualifier language) { + this.language = language; + } - public List getCountry() { - return country; - } + public List getCountry() { + return country; + } - public void setCountry(List country) { - this.country = country; - } + public void setCountry(List country) { + this.country = country; + } - public List getSubject() { - return subject; - } + public List getSubject() { + return subject; + } - public void setSubject(List subject) { - this.subject = subject; - 
} + public void setSubject(List subject) { + this.subject = subject; + } - public List getTitle() { - return title; - } + public List getTitle() { + return title; + } - public void setTitle(List title) { - this.title = title; - } + public void setTitle(List title) { + this.title = title; + } - public List getRelevantdate() { - return relevantdate; - } + public List getRelevantdate() { + return relevantdate; + } - public void setRelevantdate(List relevantdate) { - this.relevantdate = relevantdate; - } + public void setRelevantdate(List relevantdate) { + this.relevantdate = relevantdate; + } - public List> getDescription() { - return description; - } + public List> getDescription() { + return description; + } - public void setDescription(List> description) { - this.description = description; - } + public void setDescription(List> description) { + this.description = description; + } - public Field getDateofacceptance() { - return dateofacceptance; - } + public Field getDateofacceptance() { + return dateofacceptance; + } - public void setDateofacceptance(Field dateofacceptance) { - this.dateofacceptance = dateofacceptance; - } + public void setDateofacceptance(Field dateofacceptance) { + this.dateofacceptance = dateofacceptance; + } - public Field getPublisher() { - return publisher; - } + public Field getPublisher() { + return publisher; + } - public void setPublisher(Field publisher) { - this.publisher = publisher; - } + public void setPublisher(Field publisher) { + this.publisher = publisher; + } - public Field getEmbargoenddate() { - return embargoenddate; - } + public Field getEmbargoenddate() { + return embargoenddate; + } - public void setEmbargoenddate(Field embargoenddate) { - this.embargoenddate = embargoenddate; - } + public void setEmbargoenddate(Field embargoenddate) { + this.embargoenddate = embargoenddate; + } - public List> getSource() { - return source; - } + public List> getSource() { + return source; + } - public void setSource(List> source) { - 
this.source = source; - } + public void setSource(List> source) { + this.source = source; + } - public List> getFulltext() { - return fulltext; - } + public List> getFulltext() { + return fulltext; + } - public void setFulltext(List> fulltext) { - this.fulltext = fulltext; - } + public void setFulltext(List> fulltext) { + this.fulltext = fulltext; + } - public List> getFormat() { - return format; - } + public List> getFormat() { + return format; + } - public void setFormat(List> format) { - this.format = format; - } + public void setFormat(List> format) { + this.format = format; + } - public List> getContributor() { - return contributor; - } + public List> getContributor() { + return contributor; + } - public void setContributor(List> contributor) { - this.contributor = contributor; - } + public void setContributor(List> contributor) { + this.contributor = contributor; + } - public Qualifier getResourcetype() { - return resourcetype; - } + public Qualifier getResourcetype() { + return resourcetype; + } - public void setResourcetype(Qualifier resourcetype) { - this.resourcetype = resourcetype; - } + public void setResourcetype(Qualifier resourcetype) { + this.resourcetype = resourcetype; + } - public List> getCoverage() { - return coverage; - } + public List> getCoverage() { + return coverage; + } - public void setCoverage(List> coverage) { - this.coverage = coverage; - } + public void setCoverage(List> coverage) { + this.coverage = coverage; + } - public Qualifier getBestaccessright() { - return bestaccessright; - } + public Qualifier getBestaccessright() { + return bestaccessright; + } - public void setBestaccessright(Qualifier bestaccessright) { - this.bestaccessright = bestaccessright; - } + public void setBestaccessright(Qualifier bestaccessright) { + this.bestaccessright = bestaccessright; + } - public List getContext() { - return context; - } + public List getContext() { + return context; + } - public void setContext(List context) { - this.context = context; 
- } + public void setContext(List context) { + this.context = context; + } - public List getExternalReference() { - return externalReference; - } + public List getExternalReference() { + return externalReference; + } - public void setExternalReference(List externalReference) { - this.externalReference = externalReference; - } + public void setExternalReference(List externalReference) { + this.externalReference = externalReference; + } - public List getInstance() { - return instance; - } + public List getInstance() { + return instance; + } - public void setInstance(List instance) { - this.instance = instance; - } + public void setInstance(List instance) { + this.instance = instance; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!Result.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Result.class.isAssignableFrom(e.getClass())) { + return; + } - Result r = (Result) e; + Result r = (Result) e; - instance = mergeLists(instance, r.getInstance()); + instance = mergeLists(instance, r.getInstance()); - if (r.getBestaccessright() != null && compareTrust(this, r) < 0) - bestaccessright = r.getBestaccessright(); + if (r.getBestaccessright() != null && compareTrust(this, r) < 0) + bestaccessright = r.getBestaccessright(); - if (r.getResulttype() != null && compareTrust(this, r) < 0) resulttype = r.getResulttype(); + if (r.getResulttype() != null && compareTrust(this, r) < 0) + resulttype = r.getResulttype(); - if (r.getLanguage() != null && compareTrust(this, r) < 0) language = r.getLanguage(); + if (r.getLanguage() != null && compareTrust(this, r) < 0) + language = r.getLanguage(); - country = mergeLists(country, r.getCountry()); + country = mergeLists(country, r.getCountry()); - subject = mergeLists(subject, r.getSubject()); + subject = mergeLists(subject, r.getSubject()); - title = mergeLists(title, r.getTitle()); + title = mergeLists(title, 
r.getTitle()); - relevantdate = mergeLists(relevantdate, r.getRelevantdate()); + relevantdate = mergeLists(relevantdate, r.getRelevantdate()); - description = longestLists(description, r.getDescription()); + description = longestLists(description, r.getDescription()); - if (r.getPublisher() != null && compareTrust(this, r) < 0) publisher = r.getPublisher(); + if (r.getPublisher() != null && compareTrust(this, r) < 0) + publisher = r.getPublisher(); - if (r.getEmbargoenddate() != null && compareTrust(this, r) < 0) - embargoenddate = r.getEmbargoenddate(); + if (r.getEmbargoenddate() != null && compareTrust(this, r) < 0) + embargoenddate = r.getEmbargoenddate(); - source = mergeLists(source, r.getSource()); + source = mergeLists(source, r.getSource()); - fulltext = mergeLists(fulltext, r.getFulltext()); + fulltext = mergeLists(fulltext, r.getFulltext()); - format = mergeLists(format, r.getFormat()); + format = mergeLists(format, r.getFormat()); - contributor = mergeLists(contributor, r.getContributor()); + contributor = mergeLists(contributor, r.getContributor()); - if (r.getResourcetype() != null) resourcetype = r.getResourcetype(); + if (r.getResourcetype() != null) + resourcetype = r.getResourcetype(); - coverage = mergeLists(coverage, r.getCoverage()); + coverage = mergeLists(coverage, r.getCoverage()); - context = mergeLists(context, r.getContext()); + context = mergeLists(context, r.getContext()); - externalReference = mergeLists(externalReference, r.getExternalReference()); - } + externalReference = mergeLists(externalReference, r.getExternalReference()); + } - private List> longestLists(List> a, List> b) { - if (a == null || b == null) return a == null ? 
b : a; - if (a.size() == b.size()) { - int msa = - a.stream() - .filter(i -> i.getValue() != null) - .map(i -> i.getValue().length()) - .max(Comparator.naturalOrder()) - .orElse(0); - int msb = - b.stream() - .filter(i -> i.getValue() != null) - .map(i -> i.getValue().length()) - .max(Comparator.naturalOrder()) - .orElse(0); - return msa > msb ? a : b; - } - return a.size() > b.size() ? a : b; - } + private List> longestLists(List> a, List> b) { + if (a == null || b == null) + return a == null ? b : a; + if (a.size() == b.size()) { + int msa = a + .stream() + .filter(i -> i.getValue() != null) + .map(i -> i.getValue().length()) + .max(Comparator.naturalOrder()) + .orElse(0); + int msb = b + .stream() + .filter(i -> i.getValue() != null) + .map(i -> i.getValue().length()) + .max(Comparator.naturalOrder()) + .orElse(0); + return msa > msb ? a : b; + } + return a.size() > b.size() ? a : b; + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java index ffb7e03f76..40332bf53c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java @@ -1,78 +1,78 @@ + package eu.dnetlib.dhp.schema.oaf; -import eu.dnetlib.dhp.schema.common.ModelConstants; import java.io.Serializable; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; + public class Software extends Result implements Serializable { - private List> documentationUrl; + private List> documentationUrl; - private List license; + private List license; - private Field codeRepositoryUrl; + private Field codeRepositoryUrl; - private Qualifier programmingLanguage; + private Qualifier programmingLanguage; - public Software() { - setResulttype(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE); - } + public Software() { + setResulttype(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE); + } - public List> getDocumentationUrl() { - 
return documentationUrl; - } + public List> getDocumentationUrl() { + return documentationUrl; + } - public void setDocumentationUrl(List> documentationUrl) { - this.documentationUrl = documentationUrl; - } + public void setDocumentationUrl(List> documentationUrl) { + this.documentationUrl = documentationUrl; + } - public List getLicense() { - return license; - } + public List getLicense() { + return license; + } - public void setLicense(List license) { - this.license = license; - } + public void setLicense(List license) { + this.license = license; + } - public Field getCodeRepositoryUrl() { - return codeRepositoryUrl; - } + public Field getCodeRepositoryUrl() { + return codeRepositoryUrl; + } - public void setCodeRepositoryUrl(Field codeRepositoryUrl) { - this.codeRepositoryUrl = codeRepositoryUrl; - } + public void setCodeRepositoryUrl(Field codeRepositoryUrl) { + this.codeRepositoryUrl = codeRepositoryUrl; + } - public Qualifier getProgrammingLanguage() { - return programmingLanguage; - } + public Qualifier getProgrammingLanguage() { + return programmingLanguage; + } - public void setProgrammingLanguage(Qualifier programmingLanguage) { - this.programmingLanguage = programmingLanguage; - } + public void setProgrammingLanguage(Qualifier programmingLanguage) { + this.programmingLanguage = programmingLanguage; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!Software.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Software.class.isAssignableFrom(e.getClass())) { + return; + } - final Software s = (Software) e; - documentationUrl = mergeLists(documentationUrl, s.getDocumentationUrl()); + final Software s = (Software) e; + documentationUrl = mergeLists(documentationUrl, s.getDocumentationUrl()); - license = mergeLists(license, s.getLicense()); + license = mergeLists(license, s.getLicense()); - codeRepositoryUrl = - s.getCodeRepositoryUrl() != null 
&& compareTrust(this, s) < 0 - ? s.getCodeRepositoryUrl() - : codeRepositoryUrl; + codeRepositoryUrl = s.getCodeRepositoryUrl() != null && compareTrust(this, s) < 0 + ? s.getCodeRepositoryUrl() + : codeRepositoryUrl; - programmingLanguage = - s.getProgrammingLanguage() != null && compareTrust(this, s) < 0 - ? s.getProgrammingLanguage() - : programmingLanguage; + programmingLanguage = s.getProgrammingLanguage() != null && compareTrust(this, s) < 0 + ? s.getProgrammingLanguage() + : programmingLanguage; - mergeOAFDataInfo(e); - } + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java index 2e77389a3d..1fa0de0be8 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java @@ -1,56 +1,60 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; public class StructuredProperty implements Serializable { - private String value; + private String value; - private Qualifier qualifier; + private Qualifier qualifier; - private DataInfo dataInfo; + private DataInfo dataInfo; - public String getValue() { - return value; - } + public String getValue() { + return value; + } - public void setValue(String value) { - this.value = value; - } + public void setValue(String value) { + this.value = value; + } - public Qualifier getQualifier() { - return qualifier; - } + public Qualifier getQualifier() { + return qualifier; + } - public void setQualifier(Qualifier qualifier) { - this.qualifier = qualifier; - } + public void setQualifier(Qualifier qualifier) { + this.qualifier = qualifier; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + 
this.dataInfo = dataInfo; + } - public String toComparableString() { - return value != null ? value.toLowerCase() : ""; - } + public String toComparableString() { + return value != null ? value.toLowerCase() : ""; + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - StructuredProperty other = (StructuredProperty) obj; + StructuredProperty other = (StructuredProperty) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java index e1569787b9..421b4ecaac 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java @@ -1,83 +1,89 @@ + package eu.dnetlib.dhp.schema.scholexplorer; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.OafEntity; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; + import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.OafEntity; + public class DLIDataset extends Dataset { - private String originalObjIdentifier; + private String originalObjIdentifier; - private List dlicollectedfrom; + private List dlicollectedfrom; - private String completionStatus; + private String completionStatus; - 
public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } - public List getDlicollectedfrom() { - return dlicollectedfrom; - } + public List getDlicollectedfrom() { + return dlicollectedfrom; + } - public void setDlicollectedfrom(List dlicollectedfrom) { - this.dlicollectedfrom = dlicollectedfrom; - } + public void setDlicollectedfrom(List dlicollectedfrom) { + this.dlicollectedfrom = dlicollectedfrom; + } - public String getOriginalObjIdentifier() { - return originalObjIdentifier; - } + public String getOriginalObjIdentifier() { + return originalObjIdentifier; + } - public void setOriginalObjIdentifier(String originalObjIdentifier) { - this.originalObjIdentifier = originalObjIdentifier; - } + public void setOriginalObjIdentifier(String originalObjIdentifier) { + this.originalObjIdentifier = originalObjIdentifier; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - DLIDataset p = (DLIDataset) e; - if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) - completionStatus = p.completionStatus; - if ("complete".equalsIgnoreCase(p.completionStatus)) completionStatus = "complete"; - dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); - } + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + DLIDataset p = (DLIDataset) e; + if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) + completionStatus = p.completionStatus; + if ("complete".equalsIgnoreCase(p.completionStatus)) + completionStatus = "complete"; + dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); + } - private List mergeProvenance( - final List a, 
final List b) { - Map result = new HashMap<>(); - if (a != null) - a.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + private List mergeProvenance( + final List a, final List b) { + Map result = new HashMap<>(); + if (a != null) + a + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - if (b != null) - b.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); + if (b != null) + b + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); - return new ArrayList<>(result.values()); - } + return new ArrayList<>(result.values()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java 
b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java index 2cfb6515cc..c899a899c6 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java @@ -1,81 +1,87 @@ + package eu.dnetlib.dhp.schema.scholexplorer; +import java.io.Serializable; +import java.util.*; + +import org.apache.commons.lang3.StringUtils; + import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Publication; -import java.io.Serializable; -import java.util.*; -import org.apache.commons.lang3.StringUtils; public class DLIPublication extends Publication implements Serializable { - private String originalObjIdentifier; + private String originalObjIdentifier; - private List dlicollectedfrom; + private List dlicollectedfrom; - private String completionStatus; + private String completionStatus; - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } - public List getDlicollectedfrom() { - return dlicollectedfrom; - } + public List getDlicollectedfrom() { + return dlicollectedfrom; + } - public void setDlicollectedfrom(List dlicollectedfrom) { - this.dlicollectedfrom = dlicollectedfrom; - } + public void setDlicollectedfrom(List dlicollectedfrom) { + this.dlicollectedfrom = dlicollectedfrom; + } - public String getOriginalObjIdentifier() { - return originalObjIdentifier; - } + public String getOriginalObjIdentifier() { + return originalObjIdentifier; + } - public void setOriginalObjIdentifier(String originalObjIdentifier) { - this.originalObjIdentifier = originalObjIdentifier; - } + public void setOriginalObjIdentifier(String 
originalObjIdentifier) { + this.originalObjIdentifier = originalObjIdentifier; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - DLIPublication p = (DLIPublication) e; - if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) - completionStatus = p.completionStatus; - if ("complete".equalsIgnoreCase(p.completionStatus)) completionStatus = "complete"; - dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); - } + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + DLIPublication p = (DLIPublication) e; + if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) + completionStatus = p.completionStatus; + if ("complete".equalsIgnoreCase(p.completionStatus)) + completionStatus = "complete"; + dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); + } - private List mergeProvenance( - final List a, final List b) { - Map result = new HashMap<>(); - if (a != null) - a.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + private List mergeProvenance( + final List a, final List b) { + Map result = new HashMap<>(); + if (a != null) + a + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - if (b != null) - b.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if 
("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); + if (b != null) + b + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); - return new ArrayList<>(result.values()); - } + return new ArrayList<>(result.values()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIRelation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIRelation.java index b58483cbba..d2d2089c08 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIRelation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIRelation.java @@ -1,15 +1,16 @@ + package eu.dnetlib.dhp.schema.scholexplorer; import eu.dnetlib.dhp.schema.oaf.Relation; public class DLIRelation extends Relation { - private String dateOfCollection; + private String dateOfCollection; - public String getDateOfCollection() { - return dateOfCollection; - } + public String getDateOfCollection() { + return dateOfCollection; + } - public void setDateOfCollection(String dateOfCollection) { - this.dateOfCollection = dateOfCollection; - } + public void setDateOfCollection(String dateOfCollection) { + this.dateOfCollection = dateOfCollection; + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java 
b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java index 6a58ab54ff..e9b670d032 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java @@ -1,109 +1,115 @@ + package eu.dnetlib.dhp.schema.scholexplorer; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; + import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + public class DLIUnknown extends Oaf implements Serializable { - private String id; + private String id; - private List pid; + private List pid; - private String dateofcollection; + private String dateofcollection; - private String dateoftransformation; + private String dateoftransformation; - private List dlicollectedfrom; + private List dlicollectedfrom; - private String completionStatus = "incomplete"; + private String completionStatus = "incomplete"; - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } - public List getDlicollectedfrom() { - return dlicollectedfrom; - } + public List getDlicollectedfrom() { + return dlicollectedfrom; + } - public void setDlicollectedfrom(List dlicollectedfrom) { - this.dlicollectedfrom = dlicollectedfrom; - } + public void setDlicollectedfrom(List dlicollectedfrom) { + this.dlicollectedfrom = dlicollectedfrom; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String 
id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public List getPid() { - return pid; - } + public List getPid() { + return pid; + } - public void setPid(List pid) { - this.pid = pid; - } + public void setPid(List pid) { + this.pid = pid; + } - public String getDateofcollection() { - return dateofcollection; - } + public String getDateofcollection() { + return dateofcollection; + } - public void setDateofcollection(String dateofcollection) { - this.dateofcollection = dateofcollection; - } + public void setDateofcollection(String dateofcollection) { + this.dateofcollection = dateofcollection; + } - public String getDateoftransformation() { - return dateoftransformation; - } + public String getDateoftransformation() { + return dateoftransformation; + } - public void setDateoftransformation(String dateoftransformation) { - this.dateoftransformation = dateoftransformation; - } + public void setDateoftransformation(String dateoftransformation) { + this.dateoftransformation = dateoftransformation; + } - public void mergeFrom(DLIUnknown p) { - if ("complete".equalsIgnoreCase(p.completionStatus)) completionStatus = "complete"; - dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); - } + public void mergeFrom(DLIUnknown p) { + if ("complete".equalsIgnoreCase(p.completionStatus)) + completionStatus = "complete"; + dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); + } - private List mergeProvenance( - final List a, final List b) { - Map result = new HashMap<>(); - if (a != null) - a.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + private List mergeProvenance( + final List a, final List b) { + Map result = new HashMap<>(); + if (a != null) + a + .forEach( + p -> { + 
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - if (b != null) - b.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); + if (b != null) + b + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); - return new ArrayList<>(result.values()); - } + return new ArrayList<>(result.values()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java index 52f7161b94..b1188f0648 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java @@ -1,46 +1,47 @@ + package eu.dnetlib.dhp.schema.scholexplorer; import java.io.Serializable; public class ProvenaceInfo implements Serializable { - private String id; + private String id; - private String name; 
+ private String name; - private String completionStatus; + private String completionStatus; - private String collectionMode = "collected"; + private String collectionMode = "collected"; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } - public String getCollectionMode() { - return collectionMode; - } + public String getCollectionMode() { + return collectionMode; + } - public void setCollectionMode(String collectionMode) { - this.collectionMode = collectionMode; - } + public void setCollectionMode(String collectionMode) { + this.collectionMode = collectionMode; + } } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java index 482c1c223c..4d31591a09 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java @@ -1,36 +1,40 @@ + package eu.dnetlib.dhp.schema.action; import static org.junit.jupiter.api.Assertions.*; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.io.IOException; + import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.Test; +import 
com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Relation; + /** @author claudio.atzori */ public class AtomicActionTest { - @Test - public void serializationTest() throws IOException { + @Test + public void serializationTest() throws IOException { - Relation rel = new Relation(); - rel.setSource("1"); - rel.setTarget("2"); - rel.setRelType("resultResult"); - rel.setSubRelType("dedup"); - rel.setRelClass("merges"); + Relation rel = new Relation(); + rel.setSource("1"); + rel.setTarget("2"); + rel.setRelType("resultResult"); + rel.setSubRelType("dedup"); + rel.setRelClass("merges"); - AtomicAction aa1 = new AtomicAction(Relation.class, rel); + AtomicAction aa1 = new AtomicAction(Relation.class, rel); - final ObjectMapper mapper = new ObjectMapper(); - String json = mapper.writeValueAsString(aa1); + final ObjectMapper mapper = new ObjectMapper(); + String json = mapper.writeValueAsString(aa1); - assertTrue(StringUtils.isNotBlank(json)); + assertTrue(StringUtils.isNotBlank(json)); - AtomicAction aa2 = mapper.readValue(json, AtomicAction.class); + AtomicAction aa2 = mapper.readValue(json, AtomicAction.class); - assertEquals(aa1.getClazz(), aa2.getClazz()); - assertEquals(aa1.getPayload(), aa2.getPayload()); - } + assertEquals(aa1.getClazz(), aa2.getClazz()); + assertEquals(aa1.getPayload(), aa2.getPayload()); + } } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java index 3e07ea87c5..73e8c47ffd 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java @@ -1,35 +1,37 @@ + package eu.dnetlib.dhp.schema.common; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import 
eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; + public class ModelSupportTest { - @Nested - class IsSubClass { + @Nested + class IsSubClass { - @Test - public void shouldReturnFalseWhenSubClassDoesNotExtendSuperClass() { - // when - Boolean result = ModelSupport.isSubClass(Relation.class, OafEntity.class); + @Test + public void shouldReturnFalseWhenSubClassDoesNotExtendSuperClass() { + // when + Boolean result = ModelSupport.isSubClass(Relation.class, OafEntity.class); - // then - assertFalse(result); - } + // then + assertFalse(result); + } - @Test - public void shouldReturnTrueWhenSubClassExtendsSuperClass() { - // when - Boolean result = ModelSupport.isSubClass(Result.class, OafEntity.class); + @Test + public void shouldReturnTrueWhenSubClassExtendsSuperClass() { + // when + Boolean result = ModelSupport.isSubClass(Result.class, OafEntity.class); - // then - assertTrue(result); - } - } + // then + assertTrue(result); + } + } } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java index bb5b824f69..f91646f2c5 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java @@ -1,86 +1,88 @@ + package eu.dnetlib.dhp.schema.oaf; import static org.junit.jupiter.api.Assertions.*; import java.util.Arrays; import java.util.List; + import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; public class MergeTest { - OafEntity oaf; + OafEntity oaf; - @BeforeEach - public void setUp() { - oaf = new Publication(); - } + @BeforeEach + public void setUp() { + oaf = new Publication(); + } - @Test - public void mergeListsTest() { + @Test + public 
void mergeListsTest() { - // string list merge test - List a = Arrays.asList("a", "b", "c", "e"); - List b = Arrays.asList("a", "b", "c", "d"); - List c = null; + // string list merge test + List a = Arrays.asList("a", "b", "c", "e"); + List b = Arrays.asList("a", "b", "c", "d"); + List c = null; - System.out.println("merge result 1 = " + oaf.mergeLists(a, b)); + System.out.println("merge result 1 = " + oaf.mergeLists(a, b)); - System.out.println("merge result 2 = " + oaf.mergeLists(a, c)); + System.out.println("merge result 2 = " + oaf.mergeLists(a, c)); - System.out.println("merge result 3 = " + oaf.mergeLists(c, c)); - } + System.out.println("merge result 3 = " + oaf.mergeLists(c, c)); + } - @Test - public void mergePublicationCollectedFromTest() { + @Test + public void mergePublicationCollectedFromTest() { - Publication a = new Publication(); - Publication b = new Publication(); + Publication a = new Publication(); + Publication b = new Publication(); - a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed"))); - b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open"))); + a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed"))); + b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open"))); - a.mergeFrom(b); + a.mergeFrom(b); - assertNotNull(a.getCollectedfrom()); - assertEquals(3, a.getCollectedfrom().size()); - } + assertNotNull(a.getCollectedfrom()); + assertEquals(3, a.getCollectedfrom().size()); + } - @Test - public void mergePublicationSubjectTest() { + @Test + public void mergePublicationSubjectTest() { - Publication a = new Publication(); - Publication b = new Publication(); + Publication a = new Publication(); + Publication b = new Publication(); - a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe"))); - b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe"))); + a.setSubject(Arrays.asList(setSP("a", "open", "classe"), 
setSP("b", "open", "classe"))); + b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe"))); - a.mergeFrom(b); + a.mergeFrom(b); - assertNotNull(a.getSubject()); - assertEquals(3, a.getSubject().size()); - } + assertNotNull(a.getSubject()); + assertEquals(3, a.getSubject().size()); + } - private KeyValue setKV(final String key, final String value) { + private KeyValue setKV(final String key, final String value) { - KeyValue k = new KeyValue(); + KeyValue k = new KeyValue(); - k.setKey(key); - k.setValue(value); + k.setKey(key); + k.setValue(value); - return k; - } + return k; + } - private StructuredProperty setSP( - final String value, final String schema, final String classname) { - StructuredProperty s = new StructuredProperty(); - s.setValue(value); - Qualifier q = new Qualifier(); - q.setClassname(classname); - q.setClassid(classname); - q.setSchemename(schema); - q.setSchemeid(schema); - s.setQualifier(q); - return s; - } + private StructuredProperty setSP( + final String value, final String schema, final String classname) { + StructuredProperty s = new StructuredProperty(); + s.setValue(value); + Qualifier q = new Qualifier(); + q.setClassname(classname); + q.setClassid(classname); + q.setSchemename(schema); + q.setSchemeid(schema); + s.setQualifier(q); + return s; + } } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java index 4f82cfe101..e4596fcddb 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java @@ -1,76 +1,83 @@ + package eu.dnetlib.dhp.schema.scholexplorer; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; + +import org.junit.jupiter.api.Test; + import com.fasterxml.jackson.core.JsonProcessingException; import 
com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; + import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import org.junit.jupiter.api.Test; public class DLItest { - @Test - public void testMergePublication() throws JsonProcessingException { - DLIPublication a1 = new DLIPublication(); - a1.setPid(Arrays.asList(createSP("123456", "pdb", "dnet:pid_types"))); - a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle"))); - a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd", "Zenodo", "complete"))); - a1.setCompletionStatus("complete"); + @Test + public void testMergePublication() throws JsonProcessingException { + DLIPublication a1 = new DLIPublication(); + a1.setPid(Arrays.asList(createSP("123456", "pdb", "dnet:pid_types"))); + a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle"))); + a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd", "Zenodo", "complete"))); + a1.setCompletionStatus("complete"); - DLIPublication a = new DLIPublication(); - a.setPid( - Arrays.asList( - createSP("10.11", "doi", "dnet:pid_types"), - createSP("123456", "pdb", "dnet:pid_types"))); - a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle"))); - a.setDlicollectedfrom( - Arrays.asList( - createCollectedFrom("dct", "datacite", "complete"), - createCollectedFrom("dct", "datacite", "incomplete"))); - a.setCompletionStatus("incomplete"); + DLIPublication a = new DLIPublication(); + a + .setPid( + Arrays + .asList( + createSP("10.11", "doi", "dnet:pid_types"), + createSP("123456", "pdb", "dnet:pid_types"))); + a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle"))); + a + .setDlicollectedfrom( + Arrays + .asList( + 
createCollectedFrom("dct", "datacite", "complete"), + createCollectedFrom("dct", "datacite", "incomplete"))); + a.setCompletionStatus("incomplete"); - a.mergeFrom(a1); + a.mergeFrom(a1); - ObjectMapper mapper = new ObjectMapper(); - System.out.println(mapper.writeValueAsString(a)); - } + ObjectMapper mapper = new ObjectMapper(); + System.out.println(mapper.writeValueAsString(a)); + } - @Test - public void testDeserialization() throws IOException { + @Test + public void testDeserialization() throws IOException { - final String json = - "{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. 
Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. 
Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}"; + final String json = "{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. 
Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. 
Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}"; - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - DLIDataset dliDataset = mapper.readValue(json, DLIDataset.class); - mapper.enable(SerializationFeature.INDENT_OUTPUT); - System.out.println(mapper.writeValueAsString(dliDataset)); - } + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + DLIDataset dliDataset = mapper.readValue(json, DLIDataset.class); + mapper.enable(SerializationFeature.INDENT_OUTPUT); + System.out.println(mapper.writeValueAsString(dliDataset)); + } - private ProvenaceInfo createCollectedFrom( - final String id, final String name, final String completionStatus) { - ProvenaceInfo p = new ProvenaceInfo(); - p.setId(id); - p.setName(name); - p.setCompletionStatus(completionStatus); - return p; - } + private ProvenaceInfo createCollectedFrom( + final String id, final String name, final String completionStatus) { + ProvenaceInfo p = new ProvenaceInfo(); + p.setId(id); + p.setName(name); + p.setCompletionStatus(completionStatus); + return p; + } - private StructuredProperty createSP( - final String value, final String className, final String 
schemeName) { - StructuredProperty p = new StructuredProperty(); - p.setValue(value); - Qualifier schema = new Qualifier(); - schema.setClassname(className); - schema.setClassid(className); - schema.setSchemename(schemeName); - schema.setSchemeid(schemeName); - p.setQualifier(schema); - return p; - } + private StructuredProperty createSP( + final String value, final String className, final String schemeName) { + StructuredProperty p = new StructuredProperty(); + p.setValue(value); + Qualifier schema = new Qualifier(); + schema.setClassname(className); + schema.setClassid(className); + schema.setSchemename(schemeName); + schema.setSchemeid(schemeName); + p.setQualifier(schema); + return p; + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java index 42ca86f5fc..0914381954 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java @@ -1,8 +1,23 @@ + package eu.dnetlib.dhp.actionmanager; +import java.io.Serializable; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.stream.Collectors; + +import org.dom4j.Document; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; + import eu.dnetlib.actionmanager.rmi.ActionManagerException; import eu.dnetlib.actionmanager.set.ActionManagerSet; import eu.dnetlib.actionmanager.set.ActionManagerSet.ImpactTypes; @@ -10,130 +25,120 @@ import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJo import eu.dnetlib.dhp.utils.ISLookupClientFactory; import 
eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import java.io.Serializable; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.stream.Collectors; -import org.dom4j.Document; -import org.dom4j.Element; -import org.dom4j.io.SAXReader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class ISClient implements Serializable { - private static final Logger log = - LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); + private static final Logger log = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); - private static final String INPUT_ACTION_SET_ID_SEPARATOR = ","; + private static final String INPUT_ACTION_SET_ID_SEPARATOR = ","; - private ISLookUpService isLookup; + private ISLookUpService isLookup; - public ISClient(String isLookupUrl) { - isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); - } + public ISClient(String isLookupUrl) { + isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); + } - public List getLatestRawsetPaths(String setIds) { + public List getLatestRawsetPaths(String setIds) { - List ids = - Lists.newArrayList( - Splitter.on(INPUT_ACTION_SET_ID_SEPARATOR) - .omitEmptyStrings() - .trimResults() - .split(setIds)); + List ids = Lists + .newArrayList( + Splitter + .on(INPUT_ACTION_SET_ID_SEPARATOR) + .omitEmptyStrings() + .trimResults() + .split(setIds)); - return ids.stream() - .map(id -> getSet(isLookup, id)) - .map(as -> as.getPathToLatest()) - .collect(Collectors.toCollection(ArrayList::new)); - } + return ids + .stream() + .map(id -> getSet(isLookup, id)) + .map(as -> as.getPathToLatest()) + .collect(Collectors.toCollection(ArrayList::new)); + } - private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) { + private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) { - final String q 
= - "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " - + "where $x//SET/@id = '" - + setId - + "' return $x"; + final String q = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " + + "where $x//SET/@id = '" + + setId + + "' return $x"; - try { - final String basePath = getBasePathHDFS(isLookup); - final String setProfile = isLookup.getResourceProfileByQuery(q); - return getActionManagerSet(basePath, setProfile); - } catch (ISLookUpException | ActionManagerException e) { - throw new RuntimeException("Error accessing Sets, using query: " + q); - } - } + try { + final String basePath = getBasePathHDFS(isLookup); + final String setProfile = isLookup.getResourceProfileByQuery(q); + return getActionManagerSet(basePath, setProfile); + } catch (ISLookUpException | ActionManagerException e) { + throw new RuntimeException("Error accessing Sets, using query: " + q); + } + } - private ActionManagerSet getActionManagerSet(final String basePath, final String profile) - throws ActionManagerException { - final SAXReader reader = new SAXReader(); - final ActionManagerSet set = new ActionManagerSet(); + private ActionManagerSet getActionManagerSet(final String basePath, final String profile) + throws ActionManagerException { + final SAXReader reader = new SAXReader(); + final ActionManagerSet set = new ActionManagerSet(); - try { - final Document doc = reader.read(new StringReader(profile)); + try { + final Document doc = reader.read(new StringReader(profile)); - set.setId(doc.valueOf("//SET/@id").trim()); - set.setName(doc.valueOf("//SET").trim()); - set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim())); - set.setLatest( - doc.valueOf("//RAW_SETS/LATEST/@id"), - doc.valueOf("//RAW_SETS/LATEST/@creationDate"), - doc.valueOf("//RAW_SETS/LATEST/@lastUpdate")); - set.setDirectory(doc.valueOf("//SET/@directory")); - final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED"); 
- if (expiredNodes != null) { - for (int i = 0; i < expiredNodes.size(); i++) { - Element ex = (Element) expiredNodes.get(i); - set.addExpired( - ex.attributeValue("id"), - ex.attributeValue("creationDate"), - ex.attributeValue("lastUpdate")); - } - } + set.setId(doc.valueOf("//SET/@id").trim()); + set.setName(doc.valueOf("//SET").trim()); + set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim())); + set + .setLatest( + doc.valueOf("//RAW_SETS/LATEST/@id"), + doc.valueOf("//RAW_SETS/LATEST/@creationDate"), + doc.valueOf("//RAW_SETS/LATEST/@lastUpdate")); + set.setDirectory(doc.valueOf("//SET/@directory")); + final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED"); + if (expiredNodes != null) { + for (int i = 0; i < expiredNodes.size(); i++) { + Element ex = (Element) expiredNodes.get(i); + set + .addExpired( + ex.attributeValue("id"), + ex.attributeValue("creationDate"), + ex.attributeValue("lastUpdate")); + } + } - final StringBuilder sb = new StringBuilder(); - sb.append(basePath); - sb.append("/"); - sb.append(doc.valueOf("//SET/@directory")); - sb.append("/"); - sb.append(doc.valueOf("//RAW_SETS/LATEST/@id")); - set.setPathToLatest(sb.toString()); + final StringBuilder sb = new StringBuilder(); + sb.append(basePath); + sb.append("/"); + sb.append(doc.valueOf("//SET/@directory")); + sb.append("/"); + sb.append(doc.valueOf("//RAW_SETS/LATEST/@id")); + set.setPathToLatest(sb.toString()); - return set; - } catch (Exception e) { - throw new ActionManagerException("Error creating set from profile: " + profile, e); - } - } + return set; + } catch (Exception e) { + throw new ActionManagerException("Error creating set from profile: " + profile, e); + } + } - private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException { - return queryServiceProperty(isLookup, "basePath"); - } + private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException { + return queryServiceProperty(isLookup, "basePath"); + } - 
private String queryServiceProperty(ISLookUpService isLookup, final String propertyName) - throws ActionManagerException { - final String q = - "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" - + propertyName - + "']/@value/string()"; - log.debug("quering for service property: " + q); - try { - final List value = isLookup.quickSearchProfile(q); - return Iterables.getOnlyElement(value); - } catch (ISLookUpException e) { - String msg = "Error accessing service profile, using query: " + q; - log.error(msg, e); - throw new ActionManagerException(msg, e); - } catch (NoSuchElementException e) { - String msg = "missing service property: " + propertyName; - log.error(msg, e); - throw new ActionManagerException(msg, e); - } catch (IllegalArgumentException e) { - String msg = "found more than one service property: " + propertyName; - log.error(msg, e); - throw new ActionManagerException(msg, e); - } - } + private String queryServiceProperty(ISLookUpService isLookup, final String propertyName) + throws ActionManagerException { + final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" + + propertyName + + "']/@value/string()"; + log.debug("quering for service property: " + q); + try { + final List value = isLookup.quickSearchProfile(q); + return Iterables.getOnlyElement(value); + } catch (ISLookUpException e) { + String msg = "Error accessing service profile, using query: " + q; + log.error(msg, e); + throw new ActionManagerException(msg, e); + } catch (NoSuchElementException e) { + String msg = "missing service property: " + propertyName; + log.error(msg, e); + throw new ActionManagerException(msg, e); + } catch (IllegalArgumentException e) { + String msg = "found more than one service property: " + propertyName; + log.error(msg, e); + throw new ActionManagerException(msg, e); + } + } } 
diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java index ae498c4118..7b6046f8b1 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java @@ -1,47 +1,69 @@ + package eu.dnetlib.dhp.actionmanager.migration; -import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; import java.util.Comparator; +import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; + public class LicenseComparator implements Comparator { - @Override - public int compare(Qualifier left, Qualifier right) { + @Override + public int compare(Qualifier left, Qualifier right) { - if (left == null && right == null) return 0; - if (left == null) return 1; - if (right == null) return -1; + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; - String lClass = left.getClassid(); - String rClass = right.getClassid(); + String lClass = left.getClassid(); + String rClass = right.getClassid(); - if (lClass.equals(rClass)) return 0; + if (lClass.equals(rClass)) + return 0; - if (lClass.equals("OPEN SOURCE")) return -1; - if (rClass.equals("OPEN SOURCE")) return 1; + if (lClass.equals("OPEN SOURCE")) + return -1; + if (rClass.equals("OPEN SOURCE")) + return 1; - if (lClass.equals("OPEN")) return -1; - if (rClass.equals("OPEN")) return 1; + if (lClass.equals("OPEN")) + return -1; + if (rClass.equals("OPEN")) + return 1; - if (lClass.equals("6MONTHS")) return -1; - if (rClass.equals("6MONTHS")) return 1; + if (lClass.equals("6MONTHS")) + return -1; + if (rClass.equals("6MONTHS")) + return 1; - if (lClass.equals("12MONTHS")) return -1; - if (rClass.equals("12MONTHS")) return 1; + if (lClass.equals("12MONTHS")) 
+ return -1; + if (rClass.equals("12MONTHS")) + return 1; - if (lClass.equals("EMBARGO")) return -1; - if (rClass.equals("EMBARGO")) return 1; + if (lClass.equals("EMBARGO")) + return -1; + if (rClass.equals("EMBARGO")) + return 1; - if (lClass.equals("RESTRICTED")) return -1; - if (rClass.equals("RESTRICTED")) return 1; + if (lClass.equals("RESTRICTED")) + return -1; + if (rClass.equals("RESTRICTED")) + return 1; - if (lClass.equals("CLOSED")) return -1; - if (rClass.equals("CLOSED")) return 1; + if (lClass.equals("CLOSED")) + return -1; + if (rClass.equals("CLOSED")) + return 1; - if (lClass.equals("UNKNOWN")) return -1; - if (rClass.equals("UNKNOWN")) return 1; + if (lClass.equals("UNKNOWN")) + return -1; + if (rClass.equals("UNKNOWN")) + return 1; - // Else (but unlikely), lexicographical ordering will do. - return lClass.compareTo(rClass); - } + // Else (but unlikely), lexicographical ordering will do. + return lClass.compareTo(rClass); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java index 43ad7c5e36..89cb63fabf 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java @@ -1,11 +1,6 @@ + package eu.dnetlib.dhp.actionmanager.migration; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStream; @@ -14,6 +9,7 @@ import java.util.LinkedList; import java.util.List; import 
java.util.Properties; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -25,164 +21,174 @@ import org.apache.hadoop.util.ToolRunner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + public class MigrateActionSet { - private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class); + private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class); - private static final String SEPARATOR = "/"; - private static final String TARGET_PATHS = "target_paths"; - private static final String RAWSET_PREFIX = "rawset_"; + private static final String SEPARATOR = "/"; + private static final String TARGET_PATHS = "target_paths"; + private static final String RAWSET_PREFIX = "rawset_"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateActionSet.class.getResourceAsStream( - "/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateActionSet.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json"))); + parser.parseArgument(args); - new MigrateActionSet().run(parser); - } + new MigrateActionSet().run(parser); + } - private void run(ArgumentApplicationParser parser) throws Exception { + private void run(ArgumentApplicationParser 
parser) throws Exception { - final String isLookupUrl = parser.get("isLookupUrl"); - final String sourceNN = parser.get("sourceNameNode"); - final String targetNN = parser.get("targetNameNode"); - final String workDir = parser.get("workingDirectory"); - final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps")); + final String isLookupUrl = parser.get("isLookupUrl"); + final String sourceNN = parser.get("sourceNameNode"); + final String targetNN = parser.get("targetNameNode"); + final String workDir = parser.get("workingDirectory"); + final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps")); - final String distcp_memory_mb = parser.get("distcp_memory_mb"); - final String distcp_task_timeout = parser.get("distcp_task_timeout"); + final String distcp_memory_mb = parser.get("distcp_memory_mb"); + final String distcp_task_timeout = parser.get("distcp_task_timeout"); - final String transform_only_s = parser.get("transform_only"); + final String transform_only_s = parser.get("transform_only"); - log.info("transform only param: {}", transform_only_s); + log.info("transform only param: {}", transform_only_s); - final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only")); + final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only")); - log.info("transform only: {}", transformOnly); + log.info("transform only: {}", transformOnly); - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); - FileSystem targetFS = FileSystem.get(conf); + Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); + FileSystem targetFS = FileSystem.get(conf); - Configuration sourceConf = - getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); - 
sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN); - FileSystem sourceFS = FileSystem.get(sourceConf); + Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); + sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN); + FileSystem sourceFS = FileSystem.get(sourceConf); - Properties props = new Properties(); + Properties props = new Properties(); - List targetPaths = new ArrayList<>(); + List targetPaths = new ArrayList<>(); - final List sourcePaths = getSourcePaths(sourceNN, isLookUp); - log.info( - "paths to process:\n{}", - sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n"))); - for (Path source : sourcePaths) { + final List sourcePaths = getSourcePaths(sourceNN, isLookUp); + log + .info( + "paths to process:\n{}", + sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n"))); + for (Path source : sourcePaths) { - if (!sourceFS.exists(source)) { - log.warn("skipping unexisting path: {}", source); - } else { + if (!sourceFS.exists(source)) { + log.warn("skipping unexisting path: {}", source); + } else { - LinkedList pathQ = - Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath())); + LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath())); - final String rawSet = pathQ.pollLast(); - log.info("got RAWSET: {}", rawSet); + final String rawSet = pathQ.pollLast(); + log.info("got RAWSET: {}", rawSet); - if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) { + if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) { - final String actionSetDirectory = pathQ.pollLast(); + final String actionSetDirectory = pathQ.pollLast(); - final Path targetPath = - new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet); + final Path targetPath = new Path( + targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet); - log.info("using TARGET PATH: {}", 
targetPath); + log.info("using TARGET PATH: {}", targetPath); - if (!transformOnly) { - if (targetFS.exists(targetPath)) { - targetFS.delete(targetPath, true); - } - runDistcp( - distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath); - } + if (!transformOnly) { + if (targetFS.exists(targetPath)) { + targetFS.delete(targetPath, true); + } + runDistcp( + distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath); + } - targetPaths.add(targetPath); - } - } - } + targetPaths.add(targetPath); + } + } + } - props.setProperty( - TARGET_PATHS, targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(","))); - File file = new File(System.getProperty("oozie.action.output.properties")); + props + .setProperty( + TARGET_PATHS, targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(","))); + File file = new File(System.getProperty("oozie.action.output.properties")); - try (OutputStream os = new FileOutputStream(file)) { - props.store(os, ""); - } - System.out.println(file.getAbsolutePath()); - } + try (OutputStream os = new FileOutputStream(file)) { + props.store(os, ""); + } + System.out.println(file.getAbsolutePath()); + } - private void runDistcp( - Integer distcp_num_maps, - String distcp_memory_mb, - String distcp_task_timeout, - Configuration conf, - Path source, - Path targetPath) - throws Exception { + private void runDistcp( + Integer distcp_num_maps, + String distcp_memory_mb, + String distcp_task_timeout, + Configuration conf, + Path source, + Path targetPath) + throws Exception { - final DistCpOptions op = new DistCpOptions(source, targetPath); - op.setMaxMaps(distcp_num_maps); - op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE); - op.preserve(DistCpOptions.FileAttribute.REPLICATION); - op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE); + final DistCpOptions op = new DistCpOptions(source, targetPath); + op.setMaxMaps(distcp_num_maps); + 
op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE); + op.preserve(DistCpOptions.FileAttribute.REPLICATION); + op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE); - int res = - ToolRunner.run( - new DistCp(conf, op), - new String[] { - "-Dmapred.task.timeout=" + distcp_task_timeout, - "-Dmapreduce.map.memory.mb=" + distcp_memory_mb, - "-pb", - "-m " + distcp_num_maps, - source.toString(), - targetPath.toString() - }); + int res = ToolRunner + .run( + new DistCp(conf, op), + new String[] { + "-Dmapred.task.timeout=" + distcp_task_timeout, + "-Dmapreduce.map.memory.mb=" + distcp_memory_mb, + "-pb", + "-m " + distcp_num_maps, + source.toString(), + targetPath.toString() + }); - if (res != 0) { - throw new RuntimeException(String.format("distcp exited with code %s", res)); - } - } + if (res != 0) { + throw new RuntimeException(String.format("distcp exited with code %s", res)); + } + } - private Configuration getConfiguration( - String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) { - final Configuration conf = new Configuration(); - conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout); - conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout); - conf.set("dfs.http.client.retry.policy.enabled", "true"); - conf.set("mapred.task.timeout", distcp_task_timeout); - conf.set("mapreduce.map.memory.mb", distcp_memory_mb); - conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps)); - return conf; - } + private Configuration getConfiguration( + String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) { + final Configuration conf = new Configuration(); + conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout); + conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout); + conf.set("dfs.http.client.retry.policy.enabled", "true"); + conf.set("mapred.task.timeout", distcp_task_timeout); + conf.set("mapreduce.map.memory.mb", distcp_memory_mb); + conf.set("mapred.map.tasks", 
String.valueOf(distcp_num_maps)); + return conf; + } - private List getSourcePaths(String sourceNN, ISLookUpService isLookUp) - throws ISLookUpException { - String XQUERY = - "distinct-values(\n" - + "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" - + "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" - + "let $setDir := $x//SET/@directory/string()\n" - + "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" - + "return concat($basePath, '/', $setDir, '/', $rawSet))"; + private List getSourcePaths(String sourceNN, ISLookUpService isLookUp) + throws ISLookUpException { + String XQUERY = "distinct-values(\n" + + "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" + + "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" + + "let $setDir := $x//SET/@directory/string()\n" + + "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" + + "return concat($basePath, '/', $setDir, '/', $rawSet))"; - log.info(String.format("running xquery:\n%s", XQUERY)); - return isLookUp.quickSearchProfile(XQUERY).stream() - .map(p -> sourceNN + p) - .map(Path::new) - .collect(Collectors.toList()); - } + log.info(String.format("running xquery:\n%s", XQUERY)); + return isLookUp + .quickSearchProfile(XQUERY) + .stream() + .map(p -> sourceNN + p) + .map(Path::new) + .collect(Collectors.toList()); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java index 894804e25a..456113c438 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java +++ 
b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.actionmanager.migration; import static eu.dnetlib.data.proto.KindProtos.Kind.entity; @@ -5,569 +6,659 @@ import static eu.dnetlib.data.proto.KindProtos.Kind.relation; import static eu.dnetlib.data.proto.TypeProtos.*; import static eu.dnetlib.data.proto.TypeProtos.Type.*; -import com.google.common.collect.Lists; -import com.googlecode.protobuf.format.JsonFormat; -import eu.dnetlib.data.proto.*; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.Serializable; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; + import org.apache.commons.lang3.StringUtils; +import com.google.common.collect.Lists; +import com.googlecode.protobuf.format.JsonFormat; + +import eu.dnetlib.data.proto.*; +import eu.dnetlib.dhp.schema.oaf.*; + public class ProtoConverter implements Serializable { - public static final String UNKNOWN = "UNKNOWN"; - public static final String NOT_AVAILABLE = "not available"; - public static final String DNET_ACCESS_MODES = "dnet:access_modes"; + public static final String UNKNOWN = "UNKNOWN"; + public static final String NOT_AVAILABLE = "not available"; + public static final String DNET_ACCESS_MODES = "dnet:access_modes"; - public static Oaf convert(OafProtos.Oaf oaf) { - try { - switch (oaf.getKind()) { - case entity: - return convertEntity(oaf); - case relation: - return convertRelation(oaf); - default: - throw new IllegalArgumentException("invalid kind " + oaf.getKind()); - } - } catch (Throwable e) { - throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e); - } - } + public static Oaf convert(OafProtos.Oaf oaf) { + try { + switch (oaf.getKind()) { + case entity: + return convertEntity(oaf); + case relation: + return convertRelation(oaf); + default: + throw new IllegalArgumentException("invalid kind " + oaf.getKind()); + } + } catch (Throwable e) { + 
throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e); + } + } - private static Relation convertRelation(OafProtos.Oaf oaf) { - final OafProtos.OafRel r = oaf.getRel(); - final Relation rel = new Relation(); - rel.setDataInfo(mapDataInfo(oaf.getDataInfo())); - rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp()); - rel.setSource(r.getSource()); - rel.setTarget(r.getTarget()); - rel.setRelType(r.getRelType().toString()); - rel.setSubRelType(r.getSubRelType().toString()); - rel.setRelClass(r.getRelClass()); - rel.setCollectedfrom( - r.getCollectedfromCount() > 0 - ? r.getCollectedfromList().stream().map(kv -> mapKV(kv)).collect(Collectors.toList()) - : null); - return rel; - } + private static Relation convertRelation(OafProtos.Oaf oaf) { + final OafProtos.OafRel r = oaf.getRel(); + final Relation rel = new Relation(); + rel.setDataInfo(mapDataInfo(oaf.getDataInfo())); + rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp()); + rel.setSource(r.getSource()); + rel.setTarget(r.getTarget()); + rel.setRelType(r.getRelType().toString()); + rel.setSubRelType(r.getSubRelType().toString()); + rel.setRelClass(r.getRelClass()); + rel + .setCollectedfrom( + r.getCollectedfromCount() > 0 + ? 
r.getCollectedfromList().stream().map(kv -> mapKV(kv)).collect(Collectors.toList()) + : null); + return rel; + } - private static OafEntity convertEntity(OafProtos.Oaf oaf) { + private static OafEntity convertEntity(OafProtos.Oaf oaf) { - switch (oaf.getEntity().getType()) { - case result: - final Result r = convertResult(oaf); - r.setInstance(convertInstances(oaf)); - return r; - case project: - return convertProject(oaf); - case datasource: - return convertDataSource(oaf); - case organization: - return convertOrganization(oaf); - default: - throw new RuntimeException("received unknown type"); - } - } + switch (oaf.getEntity().getType()) { + case result: + final Result r = convertResult(oaf); + r.setInstance(convertInstances(oaf)); + return r; + case project: + return convertProject(oaf); + case datasource: + return convertDataSource(oaf); + case organization: + return convertOrganization(oaf); + default: + throw new RuntimeException("received unknown type"); + } + } - private static List convertInstances(OafProtos.Oaf oaf) { + private static List convertInstances(OafProtos.Oaf oaf) { - final ResultProtos.Result r = oaf.getEntity().getResult(); - if (r.getInstanceCount() > 0) { - return r.getInstanceList().stream().map(i -> convertInstance(i)).collect(Collectors.toList()); - } - return Lists.newArrayList(); - } + final ResultProtos.Result r = oaf.getEntity().getResult(); + if (r.getInstanceCount() > 0) { + return r.getInstanceList().stream().map(i -> convertInstance(i)).collect(Collectors.toList()); + } + return Lists.newArrayList(); + } - private static Instance convertInstance(ResultProtos.Result.Instance ri) { - final Instance i = new Instance(); - i.setAccessright(mapQualifier(ri.getAccessright())); - i.setCollectedfrom(mapKV(ri.getCollectedfrom())); - i.setDateofacceptance(mapStringField(ri.getDateofacceptance())); - i.setDistributionlocation(ri.getDistributionlocation()); - i.setHostedby(mapKV(ri.getHostedby())); - 
i.setInstancetype(mapQualifier(ri.getInstancetype())); - i.setLicense(mapStringField(ri.getLicense())); - i.setUrl(ri.getUrlList()); - i.setRefereed(mapStringField(ri.getRefereed())); - i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount())); - i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency())); - return i; - } + private static Instance convertInstance(ResultProtos.Result.Instance ri) { + final Instance i = new Instance(); + i.setAccessright(mapQualifier(ri.getAccessright())); + i.setCollectedfrom(mapKV(ri.getCollectedfrom())); + i.setDateofacceptance(mapStringField(ri.getDateofacceptance())); + i.setDistributionlocation(ri.getDistributionlocation()); + i.setHostedby(mapKV(ri.getHostedby())); + i.setInstancetype(mapQualifier(ri.getInstancetype())); + i.setLicense(mapStringField(ri.getLicense())); + i.setUrl(ri.getUrlList()); + i.setRefereed(mapStringField(ri.getRefereed())); + i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount())); + i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency())); + return i; + } - private static Organization convertOrganization(OafProtos.Oaf oaf) { - final OrganizationProtos.Organization.Metadata m = - oaf.getEntity().getOrganization().getMetadata(); - final Organization org = setOaf(new Organization(), oaf); - setEntity(org, oaf); - org.setLegalshortname(mapStringField(m.getLegalshortname())); - org.setLegalname(mapStringField(m.getLegalname())); - org.setAlternativeNames( - m.getAlternativeNamesList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - org.setWebsiteurl(mapStringField(m.getWebsiteurl())); - org.setLogourl(mapStringField(m.getLogourl())); - org.setEclegalbody(mapStringField(m.getEclegalbody())); - org.setEclegalperson(mapStringField(m.getEclegalperson())); - org.setEcnonprofit(mapStringField(m.getEcnonprofit())); - org.setEcresearchorganization(mapStringField(m.getEcresearchorganization())); - 
org.setEchighereducation(mapStringField(m.getEchighereducation())); - org.setEcinternationalorganizationeurinterests( - mapStringField(m.getEcinternationalorganizationeurinterests())); - org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization())); - org.setEcenterprise(mapStringField(m.getEcenterprise())); - org.setEcsmevalidated(mapStringField(m.getEcsmevalidated())); - org.setEcnutscode(mapStringField(m.getEcnutscode())); - org.setCountry(mapQualifier(m.getCountry())); + private static Organization convertOrganization(OafProtos.Oaf oaf) { + final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata(); + final Organization org = setOaf(new Organization(), oaf); + setEntity(org, oaf); + org.setLegalshortname(mapStringField(m.getLegalshortname())); + org.setLegalname(mapStringField(m.getLegalname())); + org + .setAlternativeNames( + m + .getAlternativeNamesList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + org.setWebsiteurl(mapStringField(m.getWebsiteurl())); + org.setLogourl(mapStringField(m.getLogourl())); + org.setEclegalbody(mapStringField(m.getEclegalbody())); + org.setEclegalperson(mapStringField(m.getEclegalperson())); + org.setEcnonprofit(mapStringField(m.getEcnonprofit())); + org.setEcresearchorganization(mapStringField(m.getEcresearchorganization())); + org.setEchighereducation(mapStringField(m.getEchighereducation())); + org + .setEcinternationalorganizationeurinterests( + mapStringField(m.getEcinternationalorganizationeurinterests())); + org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization())); + org.setEcenterprise(mapStringField(m.getEcenterprise())); + org.setEcsmevalidated(mapStringField(m.getEcsmevalidated())); + org.setEcnutscode(mapStringField(m.getEcnutscode())); + org.setCountry(mapQualifier(m.getCountry())); - return org; - } + return org; + } - private static Datasource convertDataSource(OafProtos.Oaf oaf) { 
- final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata(); - final Datasource datasource = setOaf(new Datasource(), oaf); - setEntity(datasource, oaf); - datasource.setAccessinfopackage( - m.getAccessinfopackageList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource.setCertificates(mapStringField(m.getCertificates())); - datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); - datasource.setContactemail(mapStringField(m.getContactemail())); - datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction())); - datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype())); - datasource.setDataprovider(mapBoolField(m.getDataprovider())); - datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype())); - datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction())); - datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); - datasource.setDatauploadtype(mapStringField(m.getDatauploadtype())); - datasource.setDateofvalidation(mapStringField(m.getDateofvalidation())); - datasource.setDescription(mapStringField(m.getDescription())); - datasource.setEnglishname(mapStringField(m.getEnglishname())); - datasource.setLatitude(mapStringField(m.getLatitude())); - datasource.setLongitude(mapStringField(m.getLongitude())); - datasource.setLogourl(mapStringField(m.getLogourl())); - datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl())); - datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix())); - datasource.setOdcontenttypes( - m.getOdcontenttypesList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource.setOdlanguages( - m.getOdlanguagesList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems())); - 
datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate())); - datasource.setOdpolicies(mapStringField(m.getOdpolicies())); - datasource.setOfficialname(mapStringField(m.getOfficialname())); - datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility())); - datasource.setPidsystems(mapStringField(m.getPidsystems())); - datasource.setPolicies( - m.getPoliciesList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); - datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind())); - datasource.setReleaseenddate(mapStringField(m.getReleaseenddate())); - datasource.setServiceprovider(mapBoolField(m.getServiceprovider())); - datasource.setReleasestartdate(mapStringField(m.getReleasestartdate())); - datasource.setSubjects( - m.getSubjectsList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - datasource.setVersioning(mapBoolField(m.getVersioning())); - datasource.setWebsiteurl(mapStringField(m.getWebsiteurl())); - datasource.setJournal(mapJournal(m.getJournal())); + private static Datasource convertDataSource(OafProtos.Oaf oaf) { + final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata(); + final Datasource datasource = setOaf(new Datasource(), oaf); + setEntity(datasource, oaf); + datasource + .setAccessinfopackage( + m + .getAccessinfopackageList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource.setCertificates(mapStringField(m.getCertificates())); + datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); + datasource.setContactemail(mapStringField(m.getContactemail())); + datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction())); + datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype())); + datasource.setDataprovider(mapBoolField(m.getDataprovider())); + 
datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype())); + datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction())); + datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); + datasource.setDatauploadtype(mapStringField(m.getDatauploadtype())); + datasource.setDateofvalidation(mapStringField(m.getDateofvalidation())); + datasource.setDescription(mapStringField(m.getDescription())); + datasource.setEnglishname(mapStringField(m.getEnglishname())); + datasource.setLatitude(mapStringField(m.getLatitude())); + datasource.setLongitude(mapStringField(m.getLongitude())); + datasource.setLogourl(mapStringField(m.getLogourl())); + datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl())); + datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix())); + datasource + .setOdcontenttypes( + m + .getOdcontenttypesList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource + .setOdlanguages( + m + .getOdlanguagesList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems())); + datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate())); + datasource.setOdpolicies(mapStringField(m.getOdpolicies())); + datasource.setOfficialname(mapStringField(m.getOfficialname())); + datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility())); + datasource.setPidsystems(mapStringField(m.getPidsystems())); + datasource + .setPolicies( + m.getPoliciesList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); + datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind())); + datasource.setReleaseenddate(mapStringField(m.getReleaseenddate())); + datasource.setServiceprovider(mapBoolField(m.getServiceprovider())); + datasource.setReleasestartdate(mapStringField(m.getReleasestartdate())); + 
datasource + .setSubjects( + m + .getSubjectsList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + datasource.setVersioning(mapBoolField(m.getVersioning())); + datasource.setWebsiteurl(mapStringField(m.getWebsiteurl())); + datasource.setJournal(mapJournal(m.getJournal())); - return datasource; - } + return datasource; + } - private static Project convertProject(OafProtos.Oaf oaf) { - final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata(); - final Project project = setOaf(new Project(), oaf); - setEntity(project, oaf); - project.setAcronym(mapStringField(m.getAcronym())); - project.setCallidentifier(mapStringField(m.getCallidentifier())); - project.setCode(mapStringField(m.getCode())); - project.setContactemail(mapStringField(m.getContactemail())); - project.setContactfax(mapStringField(m.getContactfax())); - project.setContactfullname(mapStringField(m.getContactfullname())); - project.setContactphone(mapStringField(m.getContactphone())); - project.setContracttype(mapQualifier(m.getContracttype())); - project.setCurrency(mapStringField(m.getCurrency())); - project.setDuration(mapStringField(m.getDuration())); - project.setEcarticle29_3(mapStringField(m.getEcarticle293())); - project.setEcsc39(mapStringField(m.getEcsc39())); - project.setOamandatepublications(mapStringField(m.getOamandatepublications())); - project.setStartdate(mapStringField(m.getStartdate())); - project.setEnddate(mapStringField(m.getEnddate())); - project.setFundedamount(m.getFundedamount()); - project.setTotalcost(m.getTotalcost()); - project.setKeywords(mapStringField(m.getKeywords())); - project.setSubjects( - m.getSubjectsList().stream() - .map(sp -> mapStructuredProperty(sp)) - .collect(Collectors.toList())); - project.setTitle(mapStringField(m.getTitle())); - project.setWebsiteurl(mapStringField(m.getWebsiteurl())); - project.setFundingtree( - m.getFundingtreeList().stream().map(f -> 
mapStringField(f)).collect(Collectors.toList())); - project.setJsonextrainfo(mapStringField(m.getJsonextrainfo())); - project.setSummary(mapStringField(m.getSummary())); - project.setOptional1(mapStringField(m.getOptional1())); - project.setOptional2(mapStringField(m.getOptional2())); - return project; - } + private static Project convertProject(OafProtos.Oaf oaf) { + final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata(); + final Project project = setOaf(new Project(), oaf); + setEntity(project, oaf); + project.setAcronym(mapStringField(m.getAcronym())); + project.setCallidentifier(mapStringField(m.getCallidentifier())); + project.setCode(mapStringField(m.getCode())); + project.setContactemail(mapStringField(m.getContactemail())); + project.setContactfax(mapStringField(m.getContactfax())); + project.setContactfullname(mapStringField(m.getContactfullname())); + project.setContactphone(mapStringField(m.getContactphone())); + project.setContracttype(mapQualifier(m.getContracttype())); + project.setCurrency(mapStringField(m.getCurrency())); + project.setDuration(mapStringField(m.getDuration())); + project.setEcarticle29_3(mapStringField(m.getEcarticle293())); + project.setEcsc39(mapStringField(m.getEcsc39())); + project.setOamandatepublications(mapStringField(m.getOamandatepublications())); + project.setStartdate(mapStringField(m.getStartdate())); + project.setEnddate(mapStringField(m.getEnddate())); + project.setFundedamount(m.getFundedamount()); + project.setTotalcost(m.getTotalcost()); + project.setKeywords(mapStringField(m.getKeywords())); + project + .setSubjects( + m + .getSubjectsList() + .stream() + .map(sp -> mapStructuredProperty(sp)) + .collect(Collectors.toList())); + project.setTitle(mapStringField(m.getTitle())); + project.setWebsiteurl(mapStringField(m.getWebsiteurl())); + project + .setFundingtree( + m.getFundingtreeList().stream().map(f -> mapStringField(f)).collect(Collectors.toList())); + 
project.setJsonextrainfo(mapStringField(m.getJsonextrainfo())); + project.setSummary(mapStringField(m.getSummary())); + project.setOptional1(mapStringField(m.getOptional1())); + project.setOptional2(mapStringField(m.getOptional2())); + return project; + } - private static Result convertResult(OafProtos.Oaf oaf) { - switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) { - case "dataset": - return createDataset(oaf); - case "publication": - return createPublication(oaf); - case "software": - return createSoftware(oaf); - case "other": - return createORP(oaf); - default: - Result result = setOaf(new Result(), oaf); - setEntity(result, oaf); - return setResult(result, oaf); - } - } + private static Result convertResult(OafProtos.Oaf oaf) { + switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) { + case "dataset": + return createDataset(oaf); + case "publication": + return createPublication(oaf); + case "software": + return createSoftware(oaf); + case "other": + return createORP(oaf); + default: + Result result = setOaf(new Result(), oaf); + setEntity(result, oaf); + return setResult(result, oaf); + } + } - private static Software createSoftware(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - Software software = setOaf(new Software(), oaf); - setEntity(software, oaf); - setResult(software, oaf); + private static Software createSoftware(OafProtos.Oaf oaf) { + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Software software = setOaf(new Software(), oaf); + setEntity(software, oaf); + setResult(software, oaf); - software.setDocumentationUrl( - m.getDocumentationUrlList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - software.setLicense( - m.getLicenseList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - 
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl())); - software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage())); - return software; - } + software + .setDocumentationUrl( + m + .getDocumentationUrlList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + software + .setLicense( + m + .getLicenseList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl())); + software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage())); + return software; + } - private static OtherResearchProduct createORP(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf); - setEntity(otherResearchProducts, oaf); - setResult(otherResearchProducts, oaf); - otherResearchProducts.setContactperson( - m.getContactpersonList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - otherResearchProducts.setContactgroup( - m.getContactgroupList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - otherResearchProducts.setTool( - m.getToolList().stream().map(ProtoConverter::mapStringField).collect(Collectors.toList())); + private static OtherResearchProduct createORP(OafProtos.Oaf oaf) { + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf); + setEntity(otherResearchProducts, oaf); + setResult(otherResearchProducts, oaf); + otherResearchProducts + .setContactperson( + m + .getContactpersonList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + otherResearchProducts + .setContactgroup( + m + .getContactgroupList() + .stream() + .map(ProtoConverter::mapStringField) + 
.collect(Collectors.toList())); + otherResearchProducts + .setTool( + m.getToolList().stream().map(ProtoConverter::mapStringField).collect(Collectors.toList())); - return otherResearchProducts; - } + return otherResearchProducts; + } - private static Publication createPublication(OafProtos.Oaf oaf) { + private static Publication createPublication(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - Publication publication = setOaf(new Publication(), oaf); - setEntity(publication, oaf); - setResult(publication, oaf); - publication.setJournal(mapJournal(m.getJournal())); - return publication; - } + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Publication publication = setOaf(new Publication(), oaf); + setEntity(publication, oaf); + setResult(publication, oaf); + publication.setJournal(mapJournal(m.getJournal())); + return publication; + } - private static Dataset createDataset(OafProtos.Oaf oaf) { + private static Dataset createDataset(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - Dataset dataset = setOaf(new Dataset(), oaf); - setEntity(dataset, oaf); - setResult(dataset, oaf); - dataset.setStoragedate(mapStringField(m.getStoragedate())); - dataset.setDevice(mapStringField(m.getDevice())); - dataset.setSize(mapStringField(m.getSize())); - dataset.setVersion(mapStringField(m.getVersion())); - dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate())); - dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber())); - dataset.setGeolocation( - m.getGeolocationList().stream() - .map(ProtoConverter::mapGeolocation) - .collect(Collectors.toList())); - return dataset; - } + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Dataset dataset = setOaf(new Dataset(), oaf); + setEntity(dataset, oaf); + setResult(dataset, oaf); + 
dataset.setStoragedate(mapStringField(m.getStoragedate())); + dataset.setDevice(mapStringField(m.getDevice())); + dataset.setSize(mapStringField(m.getSize())); + dataset.setVersion(mapStringField(m.getVersion())); + dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate())); + dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber())); + dataset + .setGeolocation( + m + .getGeolocationList() + .stream() + .map(ProtoConverter::mapGeolocation) + .collect(Collectors.toList())); + return dataset; + } - public static T setOaf(T oaf, OafProtos.Oaf o) { - oaf.setDataInfo(mapDataInfo(o.getDataInfo())); - oaf.setLastupdatetimestamp(o.getLastupdatetimestamp()); - return oaf; - } + public static T setOaf(T oaf, OafProtos.Oaf o) { + oaf.setDataInfo(mapDataInfo(o.getDataInfo())); + oaf.setLastupdatetimestamp(o.getLastupdatetimestamp()); + return oaf; + } - public static T setEntity(T entity, OafProtos.Oaf oaf) { - // setting Entity fields - final OafProtos.OafEntity e = oaf.getEntity(); - entity.setId(e.getId()); - entity.setOriginalId(e.getOriginalIdList()); - entity.setCollectedfrom( - e.getCollectedfromList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); - entity.setPid( - e.getPidList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setDateofcollection(e.getDateofcollection()); - entity.setDateoftransformation(e.getDateoftransformation()); - entity.setExtraInfo( - e.getExtraInfoList().stream() - .map(ProtoConverter::mapExtraInfo) - .collect(Collectors.toList())); - return entity; - } + public static T setEntity(T entity, OafProtos.Oaf oaf) { + // setting Entity fields + final OafProtos.OafEntity e = oaf.getEntity(); + entity.setId(e.getId()); + entity.setOriginalId(e.getOriginalIdList()); + entity + .setCollectedfrom( + e.getCollectedfromList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); + entity + .setPid( + e + .getPidList() + .stream() + 
.map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setDateofcollection(e.getDateofcollection()); + entity.setDateoftransformation(e.getDateoftransformation()); + entity + .setExtraInfo( + e + .getExtraInfoList() + .stream() + .map(ProtoConverter::mapExtraInfo) + .collect(Collectors.toList())); + return entity; + } - public static T setResult(T entity, OafProtos.Oaf oaf) { - // setting Entity fields - final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - entity.setAuthor( - m.getAuthorList().stream().map(ProtoConverter::mapAuthor).collect(Collectors.toList())); - entity.setResulttype(mapQualifier(m.getResulttype())); - entity.setLanguage(mapQualifier(m.getLanguage())); - entity.setCountry( - m.getCountryList().stream() - .map(ProtoConverter::mapQualifierAsCountry) - .collect(Collectors.toList())); - entity.setSubject( - m.getSubjectList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setTitle( - m.getTitleList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setRelevantdate( - m.getRelevantdateList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setDescription( - m.getDescriptionList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setDateofacceptance(mapStringField(m.getDateofacceptance())); - entity.setPublisher(mapStringField(m.getPublisher())); - entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate())); - entity.setSource( - m.getSourceList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setFulltext( - m.getFulltextList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setFormat( - m.getFormatList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setContributor( - 
m.getContributorList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setResourcetype(mapQualifier(m.getResourcetype())); - entity.setCoverage( - m.getCoverageList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setContext( - m.getContextList().stream().map(ProtoConverter::mapContext).collect(Collectors.toList())); + public static T setResult(T entity, OafProtos.Oaf oaf) { + // setting Entity fields + final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + entity + .setAuthor( + m.getAuthorList().stream().map(ProtoConverter::mapAuthor).collect(Collectors.toList())); + entity.setResulttype(mapQualifier(m.getResulttype())); + entity.setLanguage(mapQualifier(m.getLanguage())); + entity + .setCountry( + m + .getCountryList() + .stream() + .map(ProtoConverter::mapQualifierAsCountry) + .collect(Collectors.toList())); + entity + .setSubject( + m + .getSubjectList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity + .setTitle( + m + .getTitleList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity + .setRelevantdate( + m + .getRelevantdateList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity + .setDescription( + m + .getDescriptionList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setDateofacceptance(mapStringField(m.getDateofacceptance())); + entity.setPublisher(mapStringField(m.getPublisher())); + entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate())); + entity + .setSource( + m + .getSourceList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity + .setFulltext( + m + .getFulltextList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity + .setFormat( + m + 
.getFormatList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity + .setContributor( + m + .getContributorList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setResourcetype(mapQualifier(m.getResourcetype())); + entity + .setCoverage( + m + .getCoverageList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity + .setContext( + m.getContextList().stream().map(ProtoConverter::mapContext).collect(Collectors.toList())); - entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList())); + entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList())); - return entity; - } + return entity; + } - private static Qualifier getBestAccessRights(List instanceList) { - if (instanceList != null) { - final Optional min = - instanceList.stream().map(i -> i.getAccessright()).min(new LicenseComparator()); + private static Qualifier getBestAccessRights(List instanceList) { + if (instanceList != null) { + final Optional min = instanceList + .stream() + .map(i -> i.getAccessright()) + .min(new LicenseComparator()); - final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier(); + final Qualifier rights = min.isPresent() ? 
mapQualifier(min.get()) : new Qualifier(); - if (StringUtils.isBlank(rights.getClassid())) { - rights.setClassid(UNKNOWN); - } - if (StringUtils.isBlank(rights.getClassname()) - || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { - rights.setClassname(NOT_AVAILABLE); - } - if (StringUtils.isBlank(rights.getSchemeid())) { - rights.setSchemeid(DNET_ACCESS_MODES); - } - if (StringUtils.isBlank(rights.getSchemename())) { - rights.setSchemename(DNET_ACCESS_MODES); - } + if (StringUtils.isBlank(rights.getClassid())) { + rights.setClassid(UNKNOWN); + } + if (StringUtils.isBlank(rights.getClassname()) + || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { + rights.setClassname(NOT_AVAILABLE); + } + if (StringUtils.isBlank(rights.getSchemeid())) { + rights.setSchemeid(DNET_ACCESS_MODES); + } + if (StringUtils.isBlank(rights.getSchemename())) { + rights.setSchemename(DNET_ACCESS_MODES); + } - return rights; - } - return null; - } + return rights; + } + return null; + } - private static Context mapContext(ResultProtos.Result.Context context) { + private static Context mapContext(ResultProtos.Result.Context context) { - final Context entity = new Context(); - entity.setId(context.getId()); - entity.setDataInfo( - context.getDataInfoList().stream() - .map(ProtoConverter::mapDataInfo) - .collect(Collectors.toList())); - return entity; - } + final Context entity = new Context(); + entity.setId(context.getId()); + entity + .setDataInfo( + context + .getDataInfoList() + .stream() + .map(ProtoConverter::mapDataInfo) + .collect(Collectors.toList())); + return entity; + } - public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { - final KeyValue keyValue = new KeyValue(); - keyValue.setKey(kv.getKey()); - keyValue.setValue(kv.getValue()); - keyValue.setDataInfo(mapDataInfo(kv.getDataInfo())); - return keyValue; - } + public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { + final KeyValue keyValue = new KeyValue(); + keyValue.setKey(kv.getKey()); + 
keyValue.setValue(kv.getValue()); + keyValue.setDataInfo(mapDataInfo(kv.getDataInfo())); + return keyValue; + } - public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) { - final DataInfo dataInfo = new DataInfo(); - dataInfo.setDeletedbyinference(d.getDeletedbyinference()); - dataInfo.setInferenceprovenance(d.getInferenceprovenance()); - dataInfo.setInferred(d.getInferred()); - dataInfo.setInvisible(d.getInvisible()); - dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction())); - dataInfo.setTrust(d.getTrust()); - return dataInfo; - } + public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) { + final DataInfo dataInfo = new DataInfo(); + dataInfo.setDeletedbyinference(d.getDeletedbyinference()); + dataInfo.setInferenceprovenance(d.getInferenceprovenance()); + dataInfo.setInferred(d.getInferred()); + dataInfo.setInvisible(d.getInvisible()); + dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction())); + dataInfo.setTrust(d.getTrust()); + return dataInfo; + } - public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) { - final Qualifier qualifier = new Qualifier(); - qualifier.setClassid(q.getClassid()); - qualifier.setClassname(q.getClassname()); - qualifier.setSchemeid(q.getSchemeid()); - qualifier.setSchemename(q.getSchemename()); - return qualifier; - } + public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) { + final Qualifier qualifier = new Qualifier(); + qualifier.setClassid(q.getClassid()); + qualifier.setClassname(q.getClassname()); + qualifier.setSchemeid(q.getSchemeid()); + qualifier.setSchemename(q.getSchemename()); + return qualifier; + } - public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) { - final Country c = new Country(); - c.setClassid(q.getClassid()); - c.setClassname(q.getClassname()); - c.setSchemeid(q.getSchemeid()); - c.setSchemename(q.getSchemename()); - c.setDataInfo(mapDataInfo(q.getDataInfo())); - return c; - } + public static Country 
mapQualifierAsCountry(FieldTypeProtos.Qualifier q) { + final Country c = new Country(); + c.setClassid(q.getClassid()); + c.setClassname(q.getClassname()); + c.setSchemeid(q.getSchemeid()); + c.setSchemename(q.getSchemename()); + c.setDataInfo(mapDataInfo(q.getDataInfo())); + return c; + } - public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { - final StructuredProperty structuredProperty = new StructuredProperty(); - structuredProperty.setValue(sp.getValue()); - structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); - structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo())); - return structuredProperty; - } + public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { + final StructuredProperty structuredProperty = new StructuredProperty(); + structuredProperty.setValue(sp.getValue()); + structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); + structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo())); + return structuredProperty; + } - public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) { - final ExtraInfo entity = new ExtraInfo(); - entity.setName(extraInfo.getName()); - entity.setTypology(extraInfo.getTypology()); - entity.setProvenance(extraInfo.getProvenance()); - entity.setTrust(extraInfo.getTrust()); - entity.setValue(extraInfo.getValue()); - return entity; - } + public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) { + final ExtraInfo entity = new ExtraInfo(); + entity.setName(extraInfo.getName()); + entity.setTypology(extraInfo.getTypology()); + entity.setProvenance(extraInfo.getProvenance()); + entity.setTrust(extraInfo.getTrust()); + entity.setValue(extraInfo.getValue()); + return entity; + } - public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) { - final OAIProvenance entity = new OAIProvenance(); - 
entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription())); - return entity; - } + public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) { + final OAIProvenance entity = new OAIProvenance(); + entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription())); + return entity; + } - public static OriginDescription mapOriginalDescription( - FieldTypeProtos.OAIProvenance.OriginDescription originDescription) { - final OriginDescription originDescriptionResult = new OriginDescription(); - originDescriptionResult.setHarvestDate(originDescription.getHarvestDate()); - originDescriptionResult.setAltered(originDescription.getAltered()); - originDescriptionResult.setBaseURL(originDescription.getBaseURL()); - originDescriptionResult.setIdentifier(originDescription.getIdentifier()); - originDescriptionResult.setDatestamp(originDescription.getDatestamp()); - originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace()); - return originDescriptionResult; - } + public static OriginDescription mapOriginalDescription( + FieldTypeProtos.OAIProvenance.OriginDescription originDescription) { + final OriginDescription originDescriptionResult = new OriginDescription(); + originDescriptionResult.setHarvestDate(originDescription.getHarvestDate()); + originDescriptionResult.setAltered(originDescription.getAltered()); + originDescriptionResult.setBaseURL(originDescription.getBaseURL()); + originDescriptionResult.setIdentifier(originDescription.getIdentifier()); + originDescriptionResult.setDatestamp(originDescription.getDatestamp()); + originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace()); + return originDescriptionResult; + } - public static Field mapStringField(FieldTypeProtos.StringField s) { - final Field stringField = new Field<>(); - stringField.setValue(s.getValue()); - stringField.setDataInfo(mapDataInfo(s.getDataInfo())); - return 
stringField; - } + public static Field mapStringField(FieldTypeProtos.StringField s) { + final Field stringField = new Field<>(); + stringField.setValue(s.getValue()); + stringField.setDataInfo(mapDataInfo(s.getDataInfo())); + return stringField; + } - public static Field mapBoolField(FieldTypeProtos.BoolField b) { - final Field booleanField = new Field<>(); - booleanField.setValue(b.getValue()); - booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); - return booleanField; - } + public static Field mapBoolField(FieldTypeProtos.BoolField b) { + final Field booleanField = new Field<>(); + booleanField.setValue(b.getValue()); + booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); + return booleanField; + } - public static Field mapIntField(FieldTypeProtos.IntField b) { - final Field entity = new Field<>(); - entity.setValue(b.getValue()); - entity.setDataInfo(mapDataInfo(b.getDataInfo())); - return entity; - } + public static Field mapIntField(FieldTypeProtos.IntField b) { + final Field entity = new Field<>(); + entity.setValue(b.getValue()); + entity.setDataInfo(mapDataInfo(b.getDataInfo())); + return entity; + } - public static Journal mapJournal(FieldTypeProtos.Journal j) { - final Journal journal = new Journal(); - journal.setConferencedate(j.getConferencedate()); - journal.setConferenceplace(j.getConferenceplace()); - journal.setEdition(j.getEdition()); - journal.setEp(j.getEp()); - journal.setIss(j.getIss()); - journal.setIssnLinking(j.getIssnLinking()); - journal.setIssnOnline(j.getIssnOnline()); - journal.setIssnPrinted(j.getIssnPrinted()); - journal.setName(j.getName()); - journal.setSp(j.getSp()); - journal.setVol(j.getVol()); - journal.setDataInfo(mapDataInfo(j.getDataInfo())); - return journal; - } + public static Journal mapJournal(FieldTypeProtos.Journal j) { + final Journal journal = new Journal(); + journal.setConferencedate(j.getConferencedate()); + journal.setConferenceplace(j.getConferenceplace()); + journal.setEdition(j.getEdition()); + 
journal.setEp(j.getEp()); + journal.setIss(j.getIss()); + journal.setIssnLinking(j.getIssnLinking()); + journal.setIssnOnline(j.getIssnOnline()); + journal.setIssnPrinted(j.getIssnPrinted()); + journal.setName(j.getName()); + journal.setSp(j.getSp()); + journal.setVol(j.getVol()); + journal.setDataInfo(mapDataInfo(j.getDataInfo())); + return journal; + } - public static Author mapAuthor(FieldTypeProtos.Author author) { - final Author entity = new Author(); - entity.setFullname(author.getFullname()); - entity.setName(author.getName()); - entity.setSurname(author.getSurname()); - entity.setRank(author.getRank()); - entity.setPid( - author.getPidList().stream() - .map( - kv -> { - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(kv.getValue()); - final Qualifier q = new Qualifier(); - q.setClassid(kv.getKey()); - q.setClassname(kv.getKey()); - sp.setQualifier(q); - return sp; - }) - .collect(Collectors.toList())); - entity.setAffiliation( - author.getAffiliationList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - return entity; - } + public static Author mapAuthor(FieldTypeProtos.Author author) { + final Author entity = new Author(); + entity.setFullname(author.getFullname()); + entity.setName(author.getName()); + entity.setSurname(author.getSurname()); + entity.setRank(author.getRank()); + entity + .setPid( + author + .getPidList() + .stream() + .map( + kv -> { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(kv.getValue()); + final Qualifier q = new Qualifier(); + q.setClassid(kv.getKey()); + q.setClassname(kv.getKey()); + sp.setQualifier(q); + return sp; + }) + .collect(Collectors.toList())); + entity + .setAffiliation( + author + .getAffiliationList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + return entity; + } - public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) { - final GeoLocation entity = new 
GeoLocation(); - entity.setPoint(geoLocation.getPoint()); - entity.setBox(geoLocation.getBox()); - entity.setPlace(geoLocation.getPlace()); - return entity; - } + public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) { + final GeoLocation entity = new GeoLocation(); + entity.setPoint(geoLocation.getPoint()); + entity.setBox(geoLocation.getBox()); + entity.setPlace(geoLocation.getPlace()); + return entity; + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java index d200ac18f3..490668606e 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java @@ -1,23 +1,14 @@ + package eu.dnetlib.dhp.actionmanager.migration; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import com.google.protobuf.InvalidProtocolBufferException; -import eu.dnetlib.data.proto.OafProtos; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import java.io.IOException; import java.io.Serializable; import java.util.LinkedList; import java.util.Objects; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.FileSystem; @@ -29,136 +20,153 @@ import org.apache.spark.api.java.JavaSparkContext; import 
org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.protobuf.InvalidProtocolBufferException; + +import eu.dnetlib.data.proto.OafProtos; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import scala.Tuple2; public class TransformActions implements Serializable { - private static final Logger log = LoggerFactory.getLogger(TransformActions.class); + private static final Logger log = LoggerFactory.getLogger(TransformActions.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final String SEPARATOR = "/"; + private static final String SEPARATOR = "/"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateActionSet.class.getResourceAsStream( - "/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateActionSet.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean 
isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); - final String inputPaths = parser.get("inputPaths"); + final String inputPaths = parser.get("inputPaths"); - if (StringUtils.isBlank(inputPaths)) { - throw new RuntimeException("empty inputPaths"); - } - log.info("inputPaths: {}", inputPaths); + if (StringUtils.isBlank(inputPaths)) { + throw new RuntimeException("empty inputPaths"); + } + log.info("inputPaths: {}", inputPaths); - final String targetBaseDir = getTargetBaseDir(isLookupUrl); + final String targetBaseDir = getTargetBaseDir(isLookupUrl); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, isSparkSessionManaged, spark -> transformActions(inputPaths, targetBaseDir, spark)); - } + runWithSparkSession( + conf, isSparkSessionManaged, spark -> transformActions(inputPaths, targetBaseDir, spark)); + } - private static void transformActions(String inputPaths, String targetBaseDir, SparkSession spark) - throws IOException { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); + private static void transformActions(String inputPaths, String targetBaseDir, SparkSession spark) + throws IOException { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); - for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) { + for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) { - LinkedList pathQ = 
Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath)); + LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath)); - final String rawset = pathQ.pollLast(); - final String actionSetDirectory = pathQ.pollLast(); + final String rawset = pathQ.pollLast(); + final String actionSetDirectory = pathQ.pollLast(); - final Path targetDirectory = - new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset); + final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset); - if (fs.exists(targetDirectory)) { - log.info("found target directory '{}", targetDirectory); - fs.delete(targetDirectory, true); - log.info("deleted target directory '{}", targetDirectory); - } + if (fs.exists(targetDirectory)) { + log.info("found target directory '{}", targetDirectory); + fs.delete(targetDirectory, true); + log.info("deleted target directory '{}", targetDirectory); + } - log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory); + log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory); - sc.sequenceFile(sourcePath, Text.class, Text.class) - .map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())) - .map(TransformActions::doTransform) - .filter(Objects::nonNull) - .mapToPair( - a -> new Tuple2<>(a.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(a))) - .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) - .saveAsNewAPIHadoopFile( - targetDirectory.toString(), - Text.class, - Text.class, - SequenceFileOutputFormat.class, - sc.hadoopConfiguration()); - } - } + sc + .sequenceFile(sourcePath, Text.class, Text.class) + .map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())) + .map(TransformActions::doTransform) + .filter(Objects::nonNull) + .mapToPair( + a -> new Tuple2<>(a.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(a))) + .mapToPair(t -> new Tuple2(new Text(t._1()), 
new Text(t._2()))) + .saveAsNewAPIHadoopFile( + targetDirectory.toString(), + Text.class, + Text.class, + SequenceFileOutputFormat.class, + sc.hadoopConfiguration()); + } + } - private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) - throws InvalidProtocolBufferException { + private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) + throws InvalidProtocolBufferException { - // dedup similarity relations had empty target value, don't migrate them - if (aa.getTargetValue().length == 0) { - return null; - } - final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue()); - final Oaf oaf = ProtoConverter.convert(proto_oaf); - switch (proto_oaf.getKind()) { - case entity: - switch (proto_oaf.getEntity().getType()) { - case datasource: - return new AtomicAction<>(Datasource.class, (Datasource) oaf); - case organization: - return new AtomicAction<>(Organization.class, (Organization) oaf); - case project: - return new AtomicAction<>(Project.class, (Project) oaf); - case result: - final String resulttypeid = - proto_oaf.getEntity().getResult().getMetadata().getResulttype().getClassid(); - switch (resulttypeid) { - case "publication": - return new AtomicAction<>(Publication.class, (Publication) oaf); - case "software": - return new AtomicAction<>(Software.class, (Software) oaf); - case "other": - return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf); - case "dataset": - return new AtomicAction<>(Dataset.class, (Dataset) oaf); - default: - // can be an update, where the resulttype is not specified - return new AtomicAction<>(Result.class, (Result) oaf); - } - default: - throw new IllegalArgumentException( - "invalid entity type: " + proto_oaf.getEntity().getType()); - } - case relation: - return new AtomicAction<>(Relation.class, (Relation) oaf); - default: - throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind()); - } - } + // dedup similarity 
relations had empty target value, don't migrate them + if (aa.getTargetValue().length == 0) { + return null; + } + final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue()); + final Oaf oaf = ProtoConverter.convert(proto_oaf); + switch (proto_oaf.getKind()) { + case entity: + switch (proto_oaf.getEntity().getType()) { + case datasource: + return new AtomicAction<>(Datasource.class, (Datasource) oaf); + case organization: + return new AtomicAction<>(Organization.class, (Organization) oaf); + case project: + return new AtomicAction<>(Project.class, (Project) oaf); + case result: + final String resulttypeid = proto_oaf + .getEntity() + .getResult() + .getMetadata() + .getResulttype() + .getClassid(); + switch (resulttypeid) { + case "publication": + return new AtomicAction<>(Publication.class, (Publication) oaf); + case "software": + return new AtomicAction<>(Software.class, (Software) oaf); + case "other": + return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf); + case "dataset": + return new AtomicAction<>(Dataset.class, (Dataset) oaf); + default: + // can be an update, where the resulttype is not specified + return new AtomicAction<>(Result.class, (Result) oaf); + } + default: + throw new IllegalArgumentException( + "invalid entity type: " + proto_oaf.getEntity().getType()); + } + case relation: + return new AtomicAction<>(Relation.class, (Relation) oaf); + default: + throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind()); + } + } - private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException { - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - String XQUERY = - "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()"; - return isLookUp.getResourceProfileByQuery(XQUERY); - } + private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException { 
+ ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()"; + return isLookUp.getResourceProfileByQuery(XQUERY); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java index 6eb0bac3b0..af3ef0c12e 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java @@ -1,15 +1,13 @@ + package eu.dnetlib.dhp.actionmanager.partition; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static org.apache.spark.sql.functions.*; -import eu.dnetlib.dhp.actionmanager.ISClient; -import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; import java.util.Arrays; import java.util.List; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -20,117 +18,127 @@ import org.apache.spark.sql.types.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.actionmanager.ISClient; +import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; + /** Partitions given set of action sets by payload type. 
*/ public class PartitionActionSetsByPayloadTypeJob { - private static final Logger logger = - LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); + private static final Logger logger = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); - private static final StructType KV_SCHEMA = - StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty()))); + private static final StructType KV_SCHEMA = StructType$.MODULE$ + .apply( + Arrays + .asList( + StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty()))); - private static final StructType ATOMIC_ACTION_SCHEMA = - StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply( - "payload", DataTypes.StringType, false, Metadata.empty()))); + private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$ + .apply( + Arrays + .asList( + StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$ + .apply( + "payload", DataTypes.StringType, false, Metadata.empty()))); - private ISClient isClient; + private ISClient isClient; - public PartitionActionSetsByPayloadTypeJob(String isLookupUrl) { - this.isClient = new ISClient(isLookupUrl); - } + public PartitionActionSetsByPayloadTypeJob(String isLookupUrl) { + this.isClient = new ISClient(isLookupUrl); + } - public PartitionActionSetsByPayloadTypeJob() {} + public PartitionActionSetsByPayloadTypeJob() { + } - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PromoteActionPayloadForGraphTableJob.class.getResourceAsStream( - 
"/eu/dnetlib/dhp/actionmanager/partition/partition_action_sets_by_payload_type_input_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PromoteActionPayloadForGraphTableJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/partition/partition_action_sets_by_payload_type_input_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputActionSetIds = parser.get("inputActionSetIds"); - logger.info("inputActionSetIds: {}", inputActionSetIds); + String inputActionSetIds = parser.get("inputActionSetIds"); + logger.info("inputActionSetIds: {}", inputActionSetIds); - String outputPath = parser.get("outputPath"); - logger.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + logger.info("outputPath: {}", outputPath); - String isLookupUrl = parser.get("isLookupUrl"); - logger.info("isLookupUrl: {}", isLookupUrl); + String isLookupUrl = parser.get("isLookupUrl"); + logger.info("isLookupUrl: {}", isLookupUrl); - new PartitionActionSetsByPayloadTypeJob(isLookupUrl) - .run(isSparkSessionManaged, inputActionSetIds, outputPath); - } + new PartitionActionSetsByPayloadTypeJob(isLookupUrl) + .run(isSparkSessionManaged, inputActionSetIds, outputPath); + } - protected void run(Boolean isSparkSessionManaged, String inputActionSetIds, 
String outputPath) { + protected void run(Boolean isSparkSessionManaged, String inputActionSetIds, String outputPath) { - List inputActionSetPaths = getIsClient().getLatestRawsetPaths(inputActionSetIds); - logger.info("inputActionSetPaths: {}", String.join(",", inputActionSetPaths)); + List inputActionSetPaths = getIsClient().getLatestRawsetPaths(inputActionSetIds); + logger.info("inputActionSetPaths: {}", String.join(",", inputActionSetPaths)); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - readAndWriteActionSetsFromPaths(spark, inputActionSetPaths, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + readAndWriteActionSetsFromPaths(spark, inputActionSetPaths, outputPath); + }); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - private static void readAndWriteActionSetsFromPaths( - SparkSession spark, List inputActionSetPaths, String outputPath) { - inputActionSetPaths.stream() - .filter(path -> HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) - .forEach( - inputActionSetPath -> { - Dataset actionDS = readActionSetFromPath(spark, inputActionSetPath); - saveActions(actionDS, outputPath); - }); - } + private static void readAndWriteActionSetsFromPaths( + SparkSession spark, List inputActionSetPaths, String outputPath) { + inputActionSetPaths + .stream() + .filter(path -> HdfsSupport.exists(path, 
spark.sparkContext().hadoopConfiguration())) + .forEach( + inputActionSetPath -> { + Dataset actionDS = readActionSetFromPath(spark, inputActionSetPath); + saveActions(actionDS, outputPath); + }); + } - private static Dataset readActionSetFromPath(SparkSession spark, String path) { - logger.info("Reading actions from path: {}", path); + private static Dataset readActionSetFromPath(SparkSession spark, String path) { + logger.info("Reading actions from path: {}", path); - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD rdd = - sc.sequenceFile(path, Text.class, Text.class) - .map(x -> RowFactory.create(x._1().toString(), x._2().toString())); + JavaRDD rdd = sc + .sequenceFile(path, Text.class, Text.class) + .map(x -> RowFactory.create(x._1().toString(), x._2().toString())); - return spark - .createDataFrame(rdd, KV_SCHEMA) - .withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) - .select(expr("atomic_action.*")); - } + return spark + .createDataFrame(rdd, KV_SCHEMA) + .withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) + .select(expr("atomic_action.*")); + } - private static void saveActions(Dataset actionDS, String path) { - logger.info("Saving actions to path: {}", path); - actionDS.write().partitionBy("clazz").mode(SaveMode.Append).parquet(path); - } + private static void saveActions(Dataset actionDS, String path) { + logger.info("Saving actions to path: {}", path); + actionDS.write().partitionBy("clazz").mode(SaveMode.Append).parquet(path); + } - public ISClient getIsClient() { - return isClient; - } + public ISClient getIsClient() { + return isClient; + } - public void setIsClient(ISClient isClient) { - this.isClient = isClient; - } + public void setIsClient(ISClient isClient) { + this.isClient = isClient; + } } diff --git 
a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java index ac82918429..fbb0729571 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java @@ -1,82 +1,87 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; +import java.util.function.BiFunction; + import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Relation; -import java.util.function.BiFunction; /** OAF model merging support. */ public class MergeAndGet { - private MergeAndGet() {} + private MergeAndGet() { + } - /** - * Strategy for merging OAF model objects. - * - *

MERGE_FROM_AND_GET: use OAF 'mergeFrom' method SELECT_NEWER_AND_GET: use last update - * timestamp to return newer instance - */ - public enum Strategy { - MERGE_FROM_AND_GET, - SELECT_NEWER_AND_GET - } + /** + * Strategy for merging OAF model objects. + *

+ * MERGE_FROM_AND_GET: use OAF 'mergeFrom' method SELECT_NEWER_AND_GET: use last update timestamp to return newer + * instance + */ + public enum Strategy { + MERGE_FROM_AND_GET, SELECT_NEWER_AND_GET + } - /** - * Returns a function for merging OAF model objects. - * - * @param strategy Strategy to be used to merge objects - * @param Graph table type - * @param Action payload type - * @return BiFunction to be used to merge OAF objects - */ - public static - SerializableSupplier> functionFor(Strategy strategy) { - switch (strategy) { - case MERGE_FROM_AND_GET: - return () -> MergeAndGet::mergeFromAndGet; - case SELECT_NEWER_AND_GET: - return () -> MergeAndGet::selectNewerAndGet; - } - throw new RuntimeException(); - } + /** + * Returns a function for merging OAF model objects. + * + * @param strategy Strategy to be used to merge objects + * @param Graph table type + * @param Action payload type + * @return BiFunction to be used to merge OAF objects + */ + public static SerializableSupplier> functionFor( + Strategy strategy) { + switch (strategy) { + case MERGE_FROM_AND_GET: + return () -> MergeAndGet::mergeFromAndGet; + case SELECT_NEWER_AND_GET: + return () -> MergeAndGet::selectNewerAndGet; + } + throw new RuntimeException(); + } - private static G mergeFromAndGet(G x, A y) { - if (isSubClass(x, Relation.class) && isSubClass(y, Relation.class)) { - ((Relation) x).mergeFrom((Relation) y); - return x; - } else if (isSubClass(x, OafEntity.class) - && isSubClass(y, OafEntity.class) - && isSubClass(x, y)) { - ((OafEntity) x).mergeFrom((OafEntity) y); - return x; - } - throw new RuntimeException( - String.format( - "MERGE_FROM_AND_GET incompatible types: %s, %s", - x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); - } + private static G mergeFromAndGet(G x, A y) { + if (isSubClass(x, Relation.class) && isSubClass(y, Relation.class)) { + ((Relation) x).mergeFrom((Relation) y); + return x; + } else if (isSubClass(x, OafEntity.class) + && isSubClass(y, 
OafEntity.class) + && isSubClass(x, y)) { + ((OafEntity) x).mergeFrom((OafEntity) y); + return x; + } + throw new RuntimeException( + String + .format( + "MERGE_FROM_AND_GET incompatible types: %s, %s", + x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); + } - private static G selectNewerAndGet(G x, A y) { - if (x.getClass().equals(y.getClass()) - && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { - return x; - } else if (x.getClass().equals(y.getClass()) - && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { - return (G) y; - } else if (isSubClass(x, y) && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { - return x; - } else if (isSubClass(x, y) && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { - throw new RuntimeException( - String.format( - "SELECT_NEWER_AND_GET cannot return right type when it is not the same as left type: %s, %s", - x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); - } - throw new RuntimeException( - String.format( - "SELECT_NEWER_AND_GET cannot be used when left is not subtype of right: %s, %s", - x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); - } + private static G selectNewerAndGet(G x, A y) { + if (x.getClass().equals(y.getClass()) + && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { + return x; + } else if (x.getClass().equals(y.getClass()) + && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { + return (G) y; + } else if (isSubClass(x, y) && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { + return x; + } else if (isSubClass(x, y) && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { + throw new RuntimeException( + String + .format( + "SELECT_NEWER_AND_GET cannot return right type when it is not the same as left type: %s, %s", + x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); + } + throw new RuntimeException( + String + .format( + "SELECT_NEWER_AND_GET cannot be used when left is not 
subtype of right: %s, %s", + x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index 24af1973f3..17bfc4af36 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -1,18 +1,14 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; import java.util.function.Function; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -23,204 +19,207 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + /** Applies a given action payload file to graph table of compatible type. 
*/ public class PromoteActionPayloadForGraphTableJob { - private static final Logger logger = - LoggerFactory.getLogger(PromoteActionPayloadForGraphTableJob.class); + private static final Logger logger = LoggerFactory.getLogger(PromoteActionPayloadForGraphTableJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PromoteActionPayloadForGraphTableJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PromoteActionPayloadForGraphTableJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputGraphTablePath = parser.get("inputGraphTablePath"); - logger.info("inputGraphTablePath: {}", inputGraphTablePath); + String inputGraphTablePath = parser.get("inputGraphTablePath"); + logger.info("inputGraphTablePath: {}", inputGraphTablePath); - String graphTableClassName = parser.get("graphTableClassName"); - 
logger.info("graphTableClassName: {}", graphTableClassName); + String graphTableClassName = parser.get("graphTableClassName"); + logger.info("graphTableClassName: {}", graphTableClassName); - String inputActionPayloadPath = parser.get("inputActionPayloadPath"); - logger.info("inputActionPayloadPath: {}", inputActionPayloadPath); + String inputActionPayloadPath = parser.get("inputActionPayloadPath"); + logger.info("inputActionPayloadPath: {}", inputActionPayloadPath); - String actionPayloadClassName = parser.get("actionPayloadClassName"); - logger.info("actionPayloadClassName: {}", actionPayloadClassName); + String actionPayloadClassName = parser.get("actionPayloadClassName"); + logger.info("actionPayloadClassName: {}", actionPayloadClassName); - String outputGraphTablePath = parser.get("outputGraphTablePath"); - logger.info("outputGraphTablePath: {}", outputGraphTablePath); + String outputGraphTablePath = parser.get("outputGraphTablePath"); + logger.info("outputGraphTablePath: {}", outputGraphTablePath); - MergeAndGet.Strategy strategy = - MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase()); - logger.info("strategy: {}", strategy); + MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase()); + logger.info("strategy: {}", strategy); - Class rowClazz = (Class) Class.forName(graphTableClassName); - Class actionPayloadClazz = - (Class) Class.forName(actionPayloadClassName); + Class rowClazz = (Class) Class.forName(graphTableClassName); + Class actionPayloadClazz = (Class) Class.forName(actionPayloadClassName); - throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(rowClazz, actionPayloadClazz); + throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(rowClazz, actionPayloadClazz); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new 
SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputGraphTablePath); - promoteActionPayloadForGraphTable( - spark, - inputGraphTablePath, - inputActionPayloadPath, - outputGraphTablePath, - strategy, - rowClazz, - actionPayloadClazz); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputGraphTablePath); + promoteActionPayloadForGraphTable( + spark, + inputGraphTablePath, + inputActionPayloadPath, + outputGraphTablePath, + strategy, + rowClazz, + actionPayloadClazz); + }); + } - private static void throwIfGraphTableClassIsNotSubClassOfActionPayloadClass( - Class rowClazz, Class actionPayloadClazz) { - if (!isSubClass(rowClazz, actionPayloadClazz)) { - String msg = - String.format( - "graph table class is not a subclass of action payload class: graph=%s, action=%s", - rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); - throw new RuntimeException(msg); - } - } + private static void throwIfGraphTableClassIsNotSubClassOfActionPayloadClass( + Class rowClazz, Class actionPayloadClazz) { + if (!isSubClass(rowClazz, actionPayloadClazz)) { + String msg = String + .format( + "graph table class is not a subclass of action payload class: graph=%s, action=%s", + rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); + throw new RuntimeException(msg); + } + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - private static void promoteActionPayloadForGraphTable( - SparkSession spark, - String inputGraphTablePath, - String inputActionPayloadPath, - 
String outputGraphTablePath, - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) { - Dataset rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz); - Dataset actionPayloadDS = - readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz); + private static void promoteActionPayloadForGraphTable( + SparkSession spark, + String inputGraphTablePath, + String inputActionPayloadPath, + String outputGraphTablePath, + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) { + Dataset rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz); + Dataset actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz); - Dataset result = - promoteActionPayloadForGraphTable( - rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz) - .map((MapFunction) value -> value, Encoders.bean(rowClazz)); + Dataset result = promoteActionPayloadForGraphTable( + rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz) + .map((MapFunction) value -> value, Encoders.bean(rowClazz)); - saveGraphTable(result, outputGraphTablePath); - } + saveGraphTable(result, outputGraphTablePath); + } - private static Dataset readGraphTable( - SparkSession spark, String path, Class rowClazz) { - logger.info("Reading graph table from path: {}", path); + private static Dataset readGraphTable( + SparkSession spark, String path, Class rowClazz) { + logger.info("Reading graph table from path: {}", path); - return spark - .read() - .textFile(path) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), - Encoders.bean(rowClazz)); + return spark + .read() + .textFile(path) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), + Encoders.bean(rowClazz)); - /* - * return spark .read() .parquet(path) .as(Encoders.bean(rowClazz)); - */ - } + /* + * return spark .read() .parquet(path) .as(Encoders.bean(rowClazz)); + */ + } - private static Dataset readActionPayload( - 
SparkSession spark, String path, Class actionPayloadClazz) { - logger.info("Reading action payload from path: {}", path); - return spark - .read() - .parquet(path) - .map( - (MapFunction) - value -> - OBJECT_MAPPER.readValue(value.getAs("payload"), actionPayloadClazz), - Encoders.bean(actionPayloadClazz)); - } + private static Dataset readActionPayload( + SparkSession spark, String path, Class actionPayloadClazz) { + logger.info("Reading action payload from path: {}", path); + return spark + .read() + .parquet(path) + .map( + (MapFunction) value -> OBJECT_MAPPER + .readValue(value. getAs("payload"), actionPayloadClazz), + Encoders.bean(actionPayloadClazz)); + } - private static Dataset promoteActionPayloadForGraphTable( - Dataset rowDS, - Dataset actionPayloadDS, - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) { - logger.info( - "Promoting action payload for graph table: payload={}, table={}", - actionPayloadClazz.getSimpleName(), - rowClazz.getSimpleName()); + private static Dataset promoteActionPayloadForGraphTable( + Dataset rowDS, + Dataset actionPayloadDS, + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) { + logger + .info( + "Promoting action payload for graph table: payload={}, table={}", + actionPayloadClazz.getSimpleName(), + rowClazz.getSimpleName()); - SerializableSupplier> rowIdFn = ModelSupport::idFn; - SerializableSupplier> actionPayloadIdFn = ModelSupport::idFn; - SerializableSupplier> mergeRowWithActionPayloadAndGetFn = - MergeAndGet.functionFor(strategy); - SerializableSupplier> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy); - SerializableSupplier zeroFn = zeroFn(rowClazz); - SerializableSupplier> isNotZeroFn = - PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; + SerializableSupplier> rowIdFn = ModelSupport::idFn; + SerializableSupplier> actionPayloadIdFn = ModelSupport::idFn; + SerializableSupplier> mergeRowWithActionPayloadAndGetFn = 
MergeAndGet.functionFor(strategy); + SerializableSupplier> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy); + SerializableSupplier zeroFn = zeroFn(rowClazz); + SerializableSupplier> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; - Dataset joinedAndMerged = - PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( - rowDS, - actionPayloadDS, - rowIdFn, - actionPayloadIdFn, - mergeRowWithActionPayloadAndGetFn, - rowClazz, - actionPayloadClazz); + Dataset joinedAndMerged = PromoteActionPayloadFunctions + .joinGraphTableWithActionPayloadAndMerge( + rowDS, + actionPayloadDS, + rowIdFn, + actionPayloadIdFn, + mergeRowWithActionPayloadAndGetFn, + rowClazz, + actionPayloadClazz); - return PromoteActionPayloadFunctions.groupGraphTableByIdAndMerge( - joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); - } + return PromoteActionPayloadFunctions + .groupGraphTableByIdAndMerge( + joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); + } - private static SerializableSupplier zeroFn(Class clazz) { - switch (clazz.getCanonicalName()) { - case "eu.dnetlib.dhp.schema.oaf.Dataset": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Dataset()); - case "eu.dnetlib.dhp.schema.oaf.Datasource": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Datasource()); - case "eu.dnetlib.dhp.schema.oaf.Organization": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Organization()); - case "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.OtherResearchProduct()); - case "eu.dnetlib.dhp.schema.oaf.Project": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Project()); - case "eu.dnetlib.dhp.schema.oaf.Publication": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Publication()); - case "eu.dnetlib.dhp.schema.oaf.Relation": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation()); - case 
"eu.dnetlib.dhp.schema.oaf.Software": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software()); - default: - throw new RuntimeException("unknown class: " + clazz.getCanonicalName()); - } - } + private static SerializableSupplier zeroFn(Class clazz) { + switch (clazz.getCanonicalName()) { + case "eu.dnetlib.dhp.schema.oaf.Dataset": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Dataset()); + case "eu.dnetlib.dhp.schema.oaf.Datasource": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Datasource()); + case "eu.dnetlib.dhp.schema.oaf.Organization": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Organization()); + case "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.OtherResearchProduct()); + case "eu.dnetlib.dhp.schema.oaf.Project": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Project()); + case "eu.dnetlib.dhp.schema.oaf.Publication": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Publication()); + case "eu.dnetlib.dhp.schema.oaf.Relation": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation()); + case "eu.dnetlib.dhp.schema.oaf.Software": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software()); + default: + throw new RuntimeException("unknown class: " + clazz.getCanonicalName()); + } + } - private static Function isNotZeroFnUsingIdOrSource() { - return t -> { - if (isSubClass(t, Relation.class)) { - return Objects.nonNull(((Relation) t).getSource()); - } - return Objects.nonNull(((OafEntity) t).getId()); - }; - } + private static Function isNotZeroFnUsingIdOrSource() { + return t -> { + if (isSubClass(t, Relation.class)) { + return Objects.nonNull(((Relation) t).getSource()); + } + return Objects.nonNull(((OafEntity) t).getId()); + }; + } - private static void saveGraphTable(Dataset result, String path) { - logger.info("Saving graph table to path: {}", path); - result.toJSON().write().option("compression", "gzip").text(path); - } 
+ private static void saveGraphTable(Dataset result, String path) { + logger.info("Saving graph table to path: {}", path); + result.toJSON().write().option("compression", "gzip").text(path); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java index cff9640033..ffde658bd4 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java @@ -1,13 +1,13 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; -import eu.dnetlib.dhp.schema.oaf.Oaf; import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; import java.util.function.Function; + import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -15,171 +15,170 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.TypedColumn; import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; +import eu.dnetlib.dhp.schema.oaf.Oaf; import scala.Tuple2; /** Promote action payload functions. */ public class PromoteActionPayloadFunctions { - private PromoteActionPayloadFunctions() {} + private PromoteActionPayloadFunctions() { + } - /** - * Joins dataset representing graph table with dataset representing action payload using supplied - * functions. 
- * - * @param rowDS Dataset representing graph table - * @param actionPayloadDS Dataset representing action payload - * @param rowIdFn Function used to get the id of graph table row - * @param actionPayloadIdFn Function used to get id of action payload instance - * @param mergeAndGetFn Function used to merge graph table row and action payload instance - * @param rowClazz Class of graph table - * @param actionPayloadClazz Class of action payload - * @param Type of graph table row - * @param Type of action payload instance - * @return Dataset of merged graph table rows and action payload instances - */ - public static Dataset joinGraphTableWithActionPayloadAndMerge( - Dataset rowDS, - Dataset actionPayloadDS, - SerializableSupplier> rowIdFn, - SerializableSupplier> actionPayloadIdFn, - SerializableSupplier> mergeAndGetFn, - Class rowClazz, - Class actionPayloadClazz) { - if (!isSubClass(rowClazz, actionPayloadClazz)) { - throw new RuntimeException( - "action payload type must be the same or be a super type of table row type"); - } + /** + * Joins dataset representing graph table with dataset representing action payload using supplied functions. 
+ * + * @param rowDS Dataset representing graph table + * @param actionPayloadDS Dataset representing action payload + * @param rowIdFn Function used to get the id of graph table row + * @param actionPayloadIdFn Function used to get id of action payload instance + * @param mergeAndGetFn Function used to merge graph table row and action payload instance + * @param rowClazz Class of graph table + * @param actionPayloadClazz Class of action payload + * @param Type of graph table row + * @param Type of action payload instance + * @return Dataset of merged graph table rows and action payload instances + */ + public static Dataset joinGraphTableWithActionPayloadAndMerge( + Dataset rowDS, + Dataset actionPayloadDS, + SerializableSupplier> rowIdFn, + SerializableSupplier> actionPayloadIdFn, + SerializableSupplier> mergeAndGetFn, + Class rowClazz, + Class actionPayloadClazz) { + if (!isSubClass(rowClazz, actionPayloadClazz)) { + throw new RuntimeException( + "action payload type must be the same or be a super type of table row type"); + } - Dataset> rowWithIdDS = mapToTupleWithId(rowDS, rowIdFn, rowClazz); - Dataset> actionPayloadWithIdDS = - mapToTupleWithId(actionPayloadDS, actionPayloadIdFn, actionPayloadClazz); + Dataset> rowWithIdDS = mapToTupleWithId(rowDS, rowIdFn, rowClazz); + Dataset> actionPayloadWithIdDS = mapToTupleWithId( + actionPayloadDS, actionPayloadIdFn, actionPayloadClazz); - return rowWithIdDS - .joinWith( - actionPayloadWithIdDS, - rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")), - "full_outer") - .map( - (MapFunction, Tuple2>, G>) - value -> { - Optional rowOpt = Optional.ofNullable(value._1()).map(Tuple2::_2); - Optional actionPayloadOpt = Optional.ofNullable(value._2()).map(Tuple2::_2); - return rowOpt - .map( - row -> - actionPayloadOpt - .map( - actionPayload -> - mergeAndGetFn.get().apply(row, actionPayload)) - .orElse(row)) - .orElseGet( - () -> - actionPayloadOpt - .filter( - actionPayload -> 
actionPayload.getClass().equals(rowClazz)) - .map(rowClazz::cast) - .orElse(null)); - }, - Encoders.kryo(rowClazz)) - .filter((FilterFunction) Objects::nonNull); - } + return rowWithIdDS + .joinWith( + actionPayloadWithIdDS, + rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")), + "full_outer") + .map( + (MapFunction, Tuple2>, G>) value -> { + Optional rowOpt = Optional.ofNullable(value._1()).map(Tuple2::_2); + Optional actionPayloadOpt = Optional.ofNullable(value._2()).map(Tuple2::_2); + return rowOpt + .map( + row -> actionPayloadOpt + .map( + actionPayload -> mergeAndGetFn.get().apply(row, actionPayload)) + .orElse(row)) + .orElseGet( + () -> actionPayloadOpt + .filter( + actionPayload -> actionPayload.getClass().equals(rowClazz)) + .map(rowClazz::cast) + .orElse(null)); + }, + Encoders.kryo(rowClazz)) + .filter((FilterFunction) Objects::nonNull); + } - private static Dataset> mapToTupleWithId( - Dataset ds, SerializableSupplier> idFn, Class clazz) { - return ds.map( - (MapFunction>) value -> new Tuple2<>(idFn.get().apply(value), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - } + private static Dataset> mapToTupleWithId( + Dataset ds, SerializableSupplier> idFn, Class clazz) { + return ds + .map( + (MapFunction>) value -> new Tuple2<>(idFn.get().apply(value), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + } - /** - * Groups graph table by id and aggregates using supplied functions. 
- * - * @param rowDS Dataset representing graph table - * @param rowIdFn Function used to get the id of graph table row - * @param mergeAndGetFn Function used to merge graph table rows - * @param zeroFn Function to create a zero/empty instance of graph table row - * @param isNotZeroFn Function to check if graph table row is not zero/empty - * @param rowClazz Class of graph table - * @param Type of graph table row - * @return Dataset of aggregated graph table rows - */ - public static Dataset groupGraphTableByIdAndMerge( - Dataset rowDS, - SerializableSupplier> rowIdFn, - SerializableSupplier> mergeAndGetFn, - SerializableSupplier zeroFn, - SerializableSupplier> isNotZeroFn, - Class rowClazz) { - TypedColumn aggregator = - new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); - return rowDS - .groupByKey((MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) - .agg(aggregator) - .map((MapFunction, G>) Tuple2::_2, Encoders.kryo(rowClazz)); - } + /** + * Groups graph table by id and aggregates using supplied functions. 
+ * + * @param rowDS Dataset representing graph table + * @param rowIdFn Function used to get the id of graph table row + * @param mergeAndGetFn Function used to merge graph table rows + * @param zeroFn Function to create a zero/empty instance of graph table row + * @param isNotZeroFn Function to check if graph table row is not zero/empty + * @param rowClazz Class of graph table + * @param Type of graph table row + * @return Dataset of aggregated graph table rows + */ + public static Dataset groupGraphTableByIdAndMerge( + Dataset rowDS, + SerializableSupplier> rowIdFn, + SerializableSupplier> mergeAndGetFn, + SerializableSupplier zeroFn, + SerializableSupplier> isNotZeroFn, + Class rowClazz) { + TypedColumn aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); + return rowDS + .groupByKey((MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) + .agg(aggregator) + .map((MapFunction, G>) Tuple2::_2, Encoders.kryo(rowClazz)); + } - /** - * Aggregator to be used for aggregating graph table rows during grouping. - * - * @param Type of graph table row - */ - public static class TableAggregator extends Aggregator { - private SerializableSupplier zeroFn; - private SerializableSupplier> mergeAndGetFn; - private SerializableSupplier> isNotZeroFn; - private Class rowClazz; + /** + * Aggregator to be used for aggregating graph table rows during grouping. 
+ * + * @param Type of graph table row + */ + public static class TableAggregator extends Aggregator { + private SerializableSupplier zeroFn; + private SerializableSupplier> mergeAndGetFn; + private SerializableSupplier> isNotZeroFn; + private Class rowClazz; - public TableAggregator( - SerializableSupplier zeroFn, - SerializableSupplier> mergeAndGetFn, - SerializableSupplier> isNotZeroFn, - Class rowClazz) { - this.zeroFn = zeroFn; - this.mergeAndGetFn = mergeAndGetFn; - this.isNotZeroFn = isNotZeroFn; - this.rowClazz = rowClazz; - } + public TableAggregator( + SerializableSupplier zeroFn, + SerializableSupplier> mergeAndGetFn, + SerializableSupplier> isNotZeroFn, + Class rowClazz) { + this.zeroFn = zeroFn; + this.mergeAndGetFn = mergeAndGetFn; + this.isNotZeroFn = isNotZeroFn; + this.rowClazz = rowClazz; + } - @Override - public G zero() { - return zeroFn.get(); - } + @Override + public G zero() { + return zeroFn.get(); + } - @Override - public G reduce(G b, G a) { - return zeroSafeMergeAndGet(b, a); - } + @Override + public G reduce(G b, G a) { + return zeroSafeMergeAndGet(b, a); + } - @Override - public G merge(G b1, G b2) { - return zeroSafeMergeAndGet(b1, b2); - } + @Override + public G merge(G b1, G b2) { + return zeroSafeMergeAndGet(b1, b2); + } - private G zeroSafeMergeAndGet(G left, G right) { - Function isNotZero = isNotZeroFn.get(); - if (isNotZero.apply(left) && isNotZero.apply(right)) { - return mergeAndGetFn.get().apply(left, right); - } else if (isNotZero.apply(left) && !isNotZero.apply(right)) { - return left; - } else if (!isNotZero.apply(left) && isNotZero.apply(right)) { - return right; - } - throw new RuntimeException("internal aggregation error: left and right objects are zero"); - } + private G zeroSafeMergeAndGet(G left, G right) { + Function isNotZero = isNotZeroFn.get(); + if (isNotZero.apply(left) && isNotZero.apply(right)) { + return mergeAndGetFn.get().apply(left, right); + } else if (isNotZero.apply(left) && !isNotZero.apply(right)) { 
+ return left; + } else if (!isNotZero.apply(left) && isNotZero.apply(right)) { + return right; + } + throw new RuntimeException("internal aggregation error: left and right objects are zero"); + } - @Override - public G finish(G reduction) { - return reduction; - } + @Override + public G finish(G reduction) { + return reduction; + } - @Override - public Encoder bufferEncoder() { - return Encoders.kryo(rowClazz); - } + @Override + public Encoder bufferEncoder() { + return Encoders.kryo(rowClazz); + } - @Override - public Encoder outputEncoder() { - return Encoders.kryo(rowClazz); - } - } + @Override + public Encoder outputEncoder() { + return Encoders.kryo(rowClazz); + } + } } diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java index 3d36cef69b..f51c697f40 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.actionmanager.partition; import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; @@ -5,16 +6,13 @@ import static org.apache.spark.sql.functions.*; import static org.junit.jupiter.api.Assertions.assertIterableEquals; import static scala.collection.JavaConversions.mutableSeqAsJavaList; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.ISClient; -import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import 
java.util.stream.Collectors; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; @@ -32,197 +30,212 @@ import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.ISClient; +import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; import scala.collection.mutable.Seq; @ExtendWith(MockitoExtension.class) public class PartitionActionSetsByPayloadTypeJobTest { - private static final ClassLoader cl = - PartitionActionSetsByPayloadTypeJobTest.class.getClassLoader(); + private static final ClassLoader cl = PartitionActionSetsByPayloadTypeJobTest.class.getClassLoader(); - private static Configuration configuration; - private static SparkSession spark; + private static Configuration configuration; + private static SparkSession spark; - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final StructType ATOMIC_ACTION_SCHEMA = - StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply( - "payload", DataTypes.StringType, false, Metadata.empty()))); + private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$ + .apply( + Arrays + .asList( + StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$ + .apply( + "payload", DataTypes.StringType, false, Metadata.empty()))); - @BeforeAll - public static void beforeAll() throws IOException { - configuration = Job.getInstance().getConfiguration(); - SparkConf conf = new SparkConf(); - 
conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); - conf.setMaster("local"); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - spark = SparkSession.builder().config(conf).getOrCreate(); - } + @BeforeAll + public static void beforeAll() throws IOException { + configuration = Job.getInstance().getConfiguration(); + SparkConf conf = new SparkConf(); + conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); + conf.setMaster("local"); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + spark = SparkSession.builder().config(conf).getOrCreate(); + } - @AfterAll - public static void afterAll() { - spark.stop(); - } + @AfterAll + public static void afterAll() { + spark.stop(); + } - @DisplayName("Job") - @Nested - class Main { + @DisplayName("Job") + @Nested + class Main { - @Mock private ISClient isClient; + @Mock + private ISClient isClient; - @Test - public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception { - // given - Path inputActionSetsBaseDir = workingDir.resolve("input").resolve("action_sets"); - Path outputDir = workingDir.resolve("output"); + @Test + public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception { + // given + Path inputActionSetsBaseDir = workingDir.resolve("input").resolve("action_sets"); + Path outputDir = workingDir.resolve("output"); - Map> oafsByClassName = createActionSets(inputActionSetsBaseDir); + Map> oafsByClassName = createActionSets(inputActionSetsBaseDir); - List inputActionSetsPaths = resolveInputActionSetPaths(inputActionSetsBaseDir); + List inputActionSetsPaths = resolveInputActionSetPaths(inputActionSetsBaseDir); - // when - Mockito.when(isClient.getLatestRawsetPaths(Mockito.anyString())) - .thenReturn(inputActionSetsPaths); + // when + Mockito + .when(isClient.getLatestRawsetPaths(Mockito.anyString())) + .thenReturn(inputActionSetsPaths); - 
PartitionActionSetsByPayloadTypeJob job = new PartitionActionSetsByPayloadTypeJob(); - job.setIsClient(isClient); - job.run( - Boolean.FALSE, - "", // it can be empty we're mocking the response from isClient - // to - // resolve the - // paths - outputDir.toString()); + PartitionActionSetsByPayloadTypeJob job = new PartitionActionSetsByPayloadTypeJob(); + job.setIsClient(isClient); + job + .run( + Boolean.FALSE, + "", // it can be empty we're mocking the response from isClient + // to + // resolve the + // paths + outputDir.toString()); - // then - Files.exists(outputDir); + // then + Files.exists(outputDir); - assertForOafType(outputDir, oafsByClassName, eu.dnetlib.dhp.schema.oaf.Dataset.class); - assertForOafType(outputDir, oafsByClassName, Datasource.class); - assertForOafType(outputDir, oafsByClassName, Organization.class); - assertForOafType(outputDir, oafsByClassName, OtherResearchProduct.class); - assertForOafType(outputDir, oafsByClassName, Project.class); - assertForOafType(outputDir, oafsByClassName, Publication.class); - assertForOafType(outputDir, oafsByClassName, Result.class); - assertForOafType(outputDir, oafsByClassName, Relation.class); - assertForOafType(outputDir, oafsByClassName, Software.class); - } - } + assertForOafType(outputDir, oafsByClassName, eu.dnetlib.dhp.schema.oaf.Dataset.class); + assertForOafType(outputDir, oafsByClassName, Datasource.class); + assertForOafType(outputDir, oafsByClassName, Organization.class); + assertForOafType(outputDir, oafsByClassName, OtherResearchProduct.class); + assertForOafType(outputDir, oafsByClassName, Project.class); + assertForOafType(outputDir, oafsByClassName, Publication.class); + assertForOafType(outputDir, oafsByClassName, Result.class); + assertForOafType(outputDir, oafsByClassName, Relation.class); + assertForOafType(outputDir, oafsByClassName, Software.class); + } + } - private List resolveInputActionSetPaths(Path inputActionSetsBaseDir) throws IOException { - Path inputActionSetJsonDumpsDir = 
getInputActionSetJsonDumpsDir(); - return Files.list(inputActionSetJsonDumpsDir) - .map( - path -> { - String inputActionSetId = path.getFileName().toString(); - return inputActionSetsBaseDir.resolve(inputActionSetId).toString(); - }) - .collect(Collectors.toCollection(ArrayList::new)); - } + private List resolveInputActionSetPaths(Path inputActionSetsBaseDir) throws IOException { + Path inputActionSetJsonDumpsDir = getInputActionSetJsonDumpsDir(); + return Files + .list(inputActionSetJsonDumpsDir) + .map( + path -> { + String inputActionSetId = path.getFileName().toString(); + return inputActionSetsBaseDir.resolve(inputActionSetId).toString(); + }) + .collect(Collectors.toCollection(ArrayList::new)); + } - private static Map> createActionSets(Path inputActionSetsDir) - throws IOException { - Path inputActionSetJsonDumpsDir = getInputActionSetJsonDumpsDir(); + private static Map> createActionSets(Path inputActionSetsDir) + throws IOException { + Path inputActionSetJsonDumpsDir = getInputActionSetJsonDumpsDir(); - Map> oafsByType = new HashMap<>(); - Files.list(inputActionSetJsonDumpsDir) - .forEach( - inputActionSetJsonDumpFile -> { - String inputActionSetId = inputActionSetJsonDumpFile.getFileName().toString(); - Path inputActionSetDir = inputActionSetsDir.resolve(inputActionSetId); + Map> oafsByType = new HashMap<>(); + Files + .list(inputActionSetJsonDumpsDir) + .forEach( + inputActionSetJsonDumpFile -> { + String inputActionSetId = inputActionSetJsonDumpFile.getFileName().toString(); + Path inputActionSetDir = inputActionSetsDir.resolve(inputActionSetId); - Dataset actionDS = - readActionsFromJsonDump(inputActionSetJsonDumpFile.toString()).cache(); + Dataset actionDS = readActionsFromJsonDump(inputActionSetJsonDumpFile.toString()).cache(); - writeActionsAsJobInput(actionDS, inputActionSetId, inputActionSetDir.toString()); + writeActionsAsJobInput(actionDS, inputActionSetId, inputActionSetDir.toString()); - Map> actionSetOafsByType = - actionDS - 
.withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) - .select(expr("atomic_action.*")).groupBy(col("clazz")) - .agg(collect_list(col("payload")).as("payload_list")).collectAsList().stream() - .map( - row -> - new AbstractMap.SimpleEntry<>( - row.getAs("clazz"), - mutableSeqAsJavaList(row.>getAs("payload_list")))) - .collect( - Collectors.toMap( - AbstractMap.SimpleEntry::getKey, AbstractMap.SimpleEntry::getValue)); + Map> actionSetOafsByType = actionDS + .withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) + .select(expr("atomic_action.*")) + .groupBy(col("clazz")) + .agg(collect_list(col("payload")).as("payload_list")) + .collectAsList() + .stream() + .map( + row -> new AbstractMap.SimpleEntry<>( + row. getAs("clazz"), + mutableSeqAsJavaList(row.> getAs("payload_list")))) + .collect( + Collectors + .toMap( + AbstractMap.SimpleEntry::getKey, AbstractMap.SimpleEntry::getValue)); - actionSetOafsByType - .keySet() - .forEach( - x -> { - if (oafsByType.containsKey(x)) { - List collected = new ArrayList<>(); - collected.addAll(oafsByType.get(x)); - collected.addAll(actionSetOafsByType.get(x)); - oafsByType.put(x, collected); - } else { - oafsByType.put(x, actionSetOafsByType.get(x)); - } - }); - }); + actionSetOafsByType + .keySet() + .forEach( + x -> { + if (oafsByType.containsKey(x)) { + List collected = new ArrayList<>(); + collected.addAll(oafsByType.get(x)); + collected.addAll(actionSetOafsByType.get(x)); + oafsByType.put(x, collected); + } else { + oafsByType.put(x, actionSetOafsByType.get(x)); + } + }); + }); - return oafsByType; - } + return oafsByType; + } - private static Path getInputActionSetJsonDumpsDir() { - return Paths.get( - Objects.requireNonNull(cl.getResource("eu/dnetlib/dhp/actionmanager/partition/input/")) - .getFile()); - } + private static Path getInputActionSetJsonDumpsDir() { + return Paths + .get( + Objects + .requireNonNull(cl.getResource("eu/dnetlib/dhp/actionmanager/partition/input/")) + 
.getFile()); + } - private static Dataset readActionsFromJsonDump(String path) { - return spark.read().textFile(path); - } + private static Dataset readActionsFromJsonDump(String path) { + return spark.read().textFile(path); + } - private static void writeActionsAsJobInput( - Dataset actionDS, String inputActionSetId, String path) { - actionDS - .javaRDD() - .mapToPair(json -> new Tuple2<>(new Text(inputActionSetId), new Text(json))) - .saveAsNewAPIHadoopFile( - path, Text.class, Text.class, SequenceFileOutputFormat.class, configuration); - } + private static void writeActionsAsJobInput( + Dataset actionDS, String inputActionSetId, String path) { + actionDS + .javaRDD() + .mapToPair(json -> new Tuple2<>(new Text(inputActionSetId), new Text(json))) + .saveAsNewAPIHadoopFile( + path, Text.class, Text.class, SequenceFileOutputFormat.class, configuration); + } - private static void assertForOafType( - Path outputDir, Map> oafsByClassName, Class clazz) { - Path outputDatasetDir = outputDir.resolve(String.format("clazz=%s", clazz.getCanonicalName())); - Files.exists(outputDatasetDir); + private static void assertForOafType( + Path outputDir, Map> oafsByClassName, Class clazz) { + Path outputDatasetDir = outputDir.resolve(String.format("clazz=%s", clazz.getCanonicalName())); + Files.exists(outputDatasetDir); - List actuals = - readActionPayloadFromJobOutput(outputDatasetDir.toString(), clazz).collectAsList(); - actuals.sort(Comparator.comparingInt(Object::hashCode)); + List actuals = readActionPayloadFromJobOutput(outputDatasetDir.toString(), clazz).collectAsList(); + actuals.sort(Comparator.comparingInt(Object::hashCode)); - List expecteds = - oafsByClassName.get(clazz.getCanonicalName()).stream() - .map(json -> mapToOaf(json, clazz)) - .sorted(Comparator.comparingInt(Object::hashCode)) - .collect(Collectors.toList()); + List expecteds = oafsByClassName + .get(clazz.getCanonicalName()) + .stream() + .map(json -> mapToOaf(json, clazz)) + 
.sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); - assertIterableEquals(expecteds, actuals); - } + assertIterableEquals(expecteds, actuals); + } - private static Dataset readActionPayloadFromJobOutput( - String path, Class clazz) { - return spark - .read() - .parquet(path) - .map( - (MapFunction) - value -> OBJECT_MAPPER.readValue(value.getAs("payload"), clazz), - Encoders.bean(clazz)); - } + private static Dataset readActionPayloadFromJobOutput( + String path, Class clazz) { + return spark + .read() + .parquet(path) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value. getAs("payload"), clazz), + Encoders.bean(clazz)); + } - private static T mapToOaf(String json, Class clazz) { - return rethrowAsRuntimeException( - () -> OBJECT_MAPPER.readValue(json, clazz), - String.format( - "failed to map json to class: json=%s, class=%s", json, clazz.getCanonicalName())); - } + private static T mapToOaf(String json, Class clazz) { + return rethrowAsRuntimeException( + () -> OBJECT_MAPPER.readValue(json, clazz), + String + .format( + "failed to map json to class: json=%s, class=%s", json, clazz.getCanonicalName())); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java index 0de6f6b4fb..b2248d77aa 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.actionmanager.promote.MergeAndGet.Strategy; @@ -5,254 +6,252 @@ import static eu.dnetlib.dhp.actionmanager.promote.MergeAndGet.functionFor; import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.*; -import 
eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.function.BiFunction; + import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; +import eu.dnetlib.dhp.schema.oaf.*; + public class MergeAndGetTest { - @Nested - class MergeFromAndGetStrategy { + @Nested + class MergeFromAndGetStrategy { - @Test - public void shouldThrowForOafAndOaf() { - // given - Oaf a = mock(Oaf.class); - Oaf b = mock(Oaf.class); + @Test + public void shouldThrowForOafAndOaf() { + // given + Oaf a = mock(Oaf.class); + Oaf b = mock(Oaf.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForOafAndRelation() { - // given - Oaf a = mock(Oaf.class); - Relation b = mock(Relation.class); + @Test + public void shouldThrowForOafAndRelation() { + // given + Oaf a = mock(Oaf.class); + Relation b = mock(Relation.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForOafAndOafEntity() { - // given - Oaf a = mock(Oaf.class); - OafEntity b = mock(OafEntity.class); + @Test + public void shouldThrowForOafAndOafEntity() { + // given + Oaf a = mock(Oaf.class); + OafEntity b = mock(OafEntity.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = 
functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForRelationAndOaf() { - // given - Relation a = mock(Relation.class); - Oaf b = mock(Oaf.class); + @Test + public void shouldThrowForRelationAndOaf() { + // given + Relation a = mock(Relation.class); + Oaf b = mock(Oaf.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForRelationAndOafEntity() { - // given - Relation a = mock(Relation.class); - OafEntity b = mock(OafEntity.class); + @Test + public void shouldThrowForRelationAndOafEntity() { + // given + Relation a = mock(Relation.class); + OafEntity b = mock(OafEntity.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldBehaveProperlyForRelationAndRelation() { - // given - Relation a = mock(Relation.class); - Relation b = mock(Relation.class); + @Test + public void shouldBehaveProperlyForRelationAndRelation() { + // given + Relation a = mock(Relation.class); + Relation b = mock(Relation.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - Oaf x = fn.get().apply(a, b); - assertTrue(Relation.class.isAssignableFrom(x.getClass())); - 
verify(a).mergeFrom(b); - assertEquals(a, x); - } + // then + Oaf x = fn.get().apply(a, b); + assertTrue(Relation.class.isAssignableFrom(x.getClass())); + verify(a).mergeFrom(b); + assertEquals(a, x); + } - @Test - public void shouldThrowForOafEntityAndOaf() { - // given - OafEntity a = mock(OafEntity.class); - Oaf b = mock(Oaf.class); + @Test + public void shouldThrowForOafEntityAndOaf() { + // given + OafEntity a = mock(OafEntity.class); + Oaf b = mock(Oaf.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForOafEntityAndRelation() { - // given - OafEntity a = mock(OafEntity.class); - Relation b = mock(Relation.class); + @Test + public void shouldThrowForOafEntityAndRelation() { + // given + OafEntity a = mock(OafEntity.class); + Relation b = mock(Relation.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForOafEntityAndOafEntityButNotSubclasses() { - // given - class OafEntitySub1 extends OafEntity {} - class OafEntitySub2 extends OafEntity {} + @Test + public void shouldThrowForOafEntityAndOafEntityButNotSubclasses() { + // given + class OafEntitySub1 extends OafEntity { + } + class OafEntitySub2 extends OafEntity { + } - OafEntitySub1 a = mock(OafEntitySub1.class); - OafEntitySub2 b = mock(OafEntitySub2.class); + OafEntitySub1 a = mock(OafEntitySub1.class); + OafEntitySub2 b = mock(OafEntitySub2.class); - // when - SerializableSupplier> fn 
= functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldBehaveProperlyForOafEntityAndOafEntity() { - // given - OafEntity a = mock(OafEntity.class); - OafEntity b = mock(OafEntity.class); + @Test + public void shouldBehaveProperlyForOafEntityAndOafEntity() { + // given + OafEntity a = mock(OafEntity.class); + OafEntity b = mock(OafEntity.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - Oaf x = fn.get().apply(a, b); - assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); - verify(a).mergeFrom(b); - assertEquals(a, x); - } - } + // then + Oaf x = fn.get().apply(a, b); + assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); + verify(a).mergeFrom(b); + assertEquals(a, x); + } + } - @Nested - class SelectNewerAndGetStrategy { + @Nested + class SelectNewerAndGetStrategy { - @Test - public void shouldThrowForOafEntityAndRelation() { - // given - OafEntity a = mock(OafEntity.class); - Relation b = mock(Relation.class); + @Test + public void shouldThrowForOafEntityAndRelation() { + // given + OafEntity a = mock(OafEntity.class); + Relation b = mock(Relation.class); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForRelationAndOafEntity() { - // given - Relation a = mock(Relation.class); - OafEntity b = mock(OafEntity.class); + @Test + public void 
shouldThrowForRelationAndOafEntity() { + // given + Relation a = mock(Relation.class); + OafEntity b = mock(OafEntity.class); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForOafEntityAndResult() { - // given - OafEntity a = mock(OafEntity.class); - Result b = mock(Result.class); + @Test + public void shouldThrowForOafEntityAndResult() { + // given + OafEntity a = mock(OafEntity.class); + Result b = mock(Result.class); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() { - // given - // real types must be used because subclass-superclass resolution does not work for - // mocks - Dataset a = new Dataset(); - a.setLastupdatetimestamp(1L); - Result b = new Result(); - b.setLastupdatetimestamp(2L); + @Test + public void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() { + // given + // real types must be used because subclass-superclass resolution does not work for + // mocks + Dataset a = new Dataset(); + a.setLastupdatetimestamp(1L); + Result b = new Result(); + b.setLastupdatetimestamp(2L); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> 
fn.get().apply(a, b)); + } - @Test - public void shouldShouldReturnLeftForOafEntityAndOafEntity() { - // given - OafEntity a = mock(OafEntity.class); - when(a.getLastupdatetimestamp()).thenReturn(1L); - OafEntity b = mock(OafEntity.class); - when(b.getLastupdatetimestamp()).thenReturn(2L); + @Test + public void shouldShouldReturnLeftForOafEntityAndOafEntity() { + // given + OafEntity a = mock(OafEntity.class); + when(a.getLastupdatetimestamp()).thenReturn(1L); + OafEntity b = mock(OafEntity.class); + when(b.getLastupdatetimestamp()).thenReturn(2L); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - Oaf x = fn.get().apply(a, b); - assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); - assertEquals(b, x); - } + // then + Oaf x = fn.get().apply(a, b); + assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); + assertEquals(b, x); + } - @Test - public void shouldShouldReturnRightForOafEntityAndOafEntity() { - // given - OafEntity a = mock(OafEntity.class); - when(a.getLastupdatetimestamp()).thenReturn(2L); - OafEntity b = mock(OafEntity.class); - when(b.getLastupdatetimestamp()).thenReturn(1L); + @Test + public void shouldShouldReturnRightForOafEntityAndOafEntity() { + // given + OafEntity a = mock(OafEntity.class); + when(a.getLastupdatetimestamp()).thenReturn(2L); + OafEntity b = mock(OafEntity.class); + when(b.getLastupdatetimestamp()).thenReturn(1L); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - Oaf x = fn.get().apply(a, b); - assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); - assertEquals(a, x); - } - } + // then + Oaf x = fn.get().apply(a, b); + assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); + assertEquals(a, x); + } + } } diff --git 
a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java index e8f8025854..129daadcc8 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java @@ -1,11 +1,9 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.params.provider.Arguments.arguments; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -15,6 +13,7 @@ import java.util.List; import java.util.Objects; import java.util.stream.Collectors; import java.util.stream.Stream; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -26,253 +25,256 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + public class PromoteActionPayloadForGraphTableJobTest { - private static final ClassLoader cl = - PromoteActionPayloadForGraphTableJobTest.class.getClassLoader(); + private static final ClassLoader cl = PromoteActionPayloadForGraphTableJobTest.class.getClassLoader(); - private static SparkSession spark; + private static SparkSession spark; - private Path workingDir; - private Path inputDir; - private Path inputGraphRootDir; - 
private Path inputActionPayloadRootDir; - private Path outputDir; + private Path workingDir; + private Path inputDir; + private Path inputGraphRootDir; + private Path inputActionPayloadRootDir; + private Path outputDir; - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - @BeforeAll - public static void beforeAll() { - SparkConf conf = new SparkConf(); - conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); - conf.setMaster("local"); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - spark = SparkSession.builder().config(conf).getOrCreate(); - } + @BeforeAll + public static void beforeAll() { + SparkConf conf = new SparkConf(); + conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); + conf.setMaster("local"); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + spark = SparkSession.builder().config(conf).getOrCreate(); + } - @BeforeEach - public void beforeEach() throws IOException { - workingDir = - Files.createTempDirectory(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); - inputDir = workingDir.resolve("input"); - inputGraphRootDir = inputDir.resolve("graph"); - inputActionPayloadRootDir = inputDir.resolve("action_payload"); - outputDir = workingDir.resolve("output"); - } + @BeforeEach + public void beforeEach() throws IOException { + workingDir = Files.createTempDirectory(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); + inputDir = workingDir.resolve("input"); + inputGraphRootDir = inputDir.resolve("graph"); + inputActionPayloadRootDir = inputDir.resolve("action_payload"); + outputDir = workingDir.resolve("output"); + } - @AfterEach - public void afterEach() throws IOException { - 
FileUtils.deleteDirectory(workingDir.toFile()); - } + @AfterEach + public void afterEach() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + } - @AfterAll - public static void afterAll() { - spark.stop(); - } + @AfterAll + public static void afterAll() { + spark.stop(); + } - @DisplayName("Job") - @Nested - class Main { + @DisplayName("Job") + @Nested + class Main { - @Test - public void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() { - // given - Class rowClazz = Relation.class; - Class actionPayloadClazz = OafEntity.class; + @Test + public void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() { + // given + Class rowClazz = Relation.class; + Class actionPayloadClazz = OafEntity.class; - // when - RuntimeException exception = - assertThrows( - RuntimeException.class, - () -> - PromoteActionPayloadForGraphTableJob.main( - new String[] { - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-inputGraphTablePath", - "", - "-graphTableClassName", - rowClazz.getCanonicalName(), - "-inputActionPayloadPath", - "", - "-actionPayloadClassName", - actionPayloadClazz.getCanonicalName(), - "-outputGraphTablePath", - "", - "-mergeAndGetStrategy", - MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name() - })); + // when + RuntimeException exception = assertThrows( + RuntimeException.class, + () -> PromoteActionPayloadForGraphTableJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputGraphTablePath", + "", + "-graphTableClassName", + rowClazz.getCanonicalName(), + "-inputActionPayloadPath", + "", + "-actionPayloadClassName", + actionPayloadClazz.getCanonicalName(), + "-outputGraphTablePath", + "", + "-mergeAndGetStrategy", + MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name() + })); - // then - String msg = - String.format( - "graph table class is not a subclass of action payload class: graph=%s, action=%s", - rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); - 
assertTrue(exception.getMessage().contains(msg)); - } + // then + String msg = String + .format( + "graph table class is not a subclass of action payload class: graph=%s, action=%s", + rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); + assertTrue(exception.getMessage().contains(msg)); + } - @ParameterizedTest(name = "strategy: {0}, graph table: {1}, action payload: {2}") - @MethodSource( - "eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams") - public void shouldPromoteActionPayloadForGraphTable( - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) - throws Exception { - // given - Path inputGraphTableDir = createGraphTable(inputGraphRootDir, rowClazz); - Path inputActionPayloadDir = - createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz); - Path outputGraphTableDir = - outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase()); + @ParameterizedTest(name = "strategy: {0}, graph table: {1}, action payload: {2}") + @MethodSource("eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams") + public void shouldPromoteActionPayloadForGraphTable( + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) + throws Exception { + // given + Path inputGraphTableDir = createGraphTable(inputGraphRootDir, rowClazz); + Path inputActionPayloadDir = createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz); + Path outputGraphTableDir = outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase()); - // when - PromoteActionPayloadForGraphTableJob.main( - new String[] { - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-inputGraphTablePath", - inputGraphTableDir.toString(), - "-graphTableClassName", - rowClazz.getCanonicalName(), - "-inputActionPayloadPath", - inputActionPayloadDir.toString(), - "-actionPayloadClassName", - 
actionPayloadClazz.getCanonicalName(), - "-outputGraphTablePath", - outputGraphTableDir.toString(), - "-mergeAndGetStrategy", - strategy.name() - }); + // when + PromoteActionPayloadForGraphTableJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputGraphTablePath", + inputGraphTableDir.toString(), + "-graphTableClassName", + rowClazz.getCanonicalName(), + "-inputActionPayloadPath", + inputActionPayloadDir.toString(), + "-actionPayloadClassName", + actionPayloadClazz.getCanonicalName(), + "-outputGraphTablePath", + outputGraphTableDir.toString(), + "-mergeAndGetStrategy", + strategy.name() + }); - // then - assertTrue(Files.exists(outputGraphTableDir)); + // then + assertTrue(Files.exists(outputGraphTableDir)); - List actualOutputRows = - readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz).collectAsList() - .stream() - .sorted(Comparator.comparingInt(Object::hashCode)) - .collect(Collectors.toList()); - String expectedOutputGraphTableJsonDumpPath = - resultFileLocation(strategy, rowClazz, actionPayloadClazz); - Path expectedOutputGraphTableJsonDumpFile = - Paths.get( - Objects.requireNonNull(cl.getResource(expectedOutputGraphTableJsonDumpPath)) - .getFile()); - List expectedOutputRows = - readGraphTableFromJsonDump(expectedOutputGraphTableJsonDumpFile.toString(), rowClazz) - .collectAsList().stream() - .sorted(Comparator.comparingInt(Object::hashCode)) - .collect(Collectors.toList()); - assertIterableEquals(expectedOutputRows, actualOutputRows); - } - } + List actualOutputRows = readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz) + .collectAsList() + .stream() + .sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); + String expectedOutputGraphTableJsonDumpPath = resultFileLocation(strategy, rowClazz, actionPayloadClazz); + Path expectedOutputGraphTableJsonDumpFile = Paths + .get( + Objects + .requireNonNull(cl.getResource(expectedOutputGraphTableJsonDumpPath)) 
+ .getFile()); + List expectedOutputRows = readGraphTableFromJsonDump( + expectedOutputGraphTableJsonDumpFile.toString(), rowClazz) + .collectAsList() + .stream() + .sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); + assertIterableEquals(expectedOutputRows, actualOutputRows); + } + } - public static Stream promoteJobTestParams() { - return Stream.of( - arguments( - MergeAndGet.Strategy.MERGE_FROM_AND_GET, - eu.dnetlib.dhp.schema.oaf.Dataset.class, - eu.dnetlib.dhp.schema.oaf.Dataset.class), - arguments( - MergeAndGet.Strategy.MERGE_FROM_AND_GET, - eu.dnetlib.dhp.schema.oaf.Dataset.class, - eu.dnetlib.dhp.schema.oaf.Result.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Datasource.class, Datasource.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Organization.class, Organization.class), - arguments( - MergeAndGet.Strategy.MERGE_FROM_AND_GET, - OtherResearchProduct.class, - OtherResearchProduct.class), - arguments( - MergeAndGet.Strategy.MERGE_FROM_AND_GET, OtherResearchProduct.class, Result.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Project.class, Project.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Publication.class, Publication.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Publication.class, Result.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Relation.class, Relation.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Software.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Result.class)); - } + public static Stream promoteJobTestParams() { + return Stream + .of( + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + eu.dnetlib.dhp.schema.oaf.Dataset.class, + eu.dnetlib.dhp.schema.oaf.Dataset.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + eu.dnetlib.dhp.schema.oaf.Dataset.class, + eu.dnetlib.dhp.schema.oaf.Result.class), + 
arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Datasource.class, Datasource.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Organization.class, Organization.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + OtherResearchProduct.class, + OtherResearchProduct.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, OtherResearchProduct.class, Result.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Project.class, Project.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Publication.class, Publication.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Publication.class, Result.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Relation.class, Relation.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Software.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Result.class)); + } - private static Path createGraphTable(Path inputGraphRootDir, Class rowClazz) { - String inputGraphTableJsonDumpPath = inputGraphTableJsonDumpLocation(rowClazz); - Path inputGraphTableJsonDumpFile = - Paths.get(Objects.requireNonNull(cl.getResource(inputGraphTableJsonDumpPath)).getFile()); - Dataset rowDS = readGraphTableFromJsonDump(inputGraphTableJsonDumpFile.toString(), rowClazz); - String inputGraphTableName = rowClazz.getSimpleName().toLowerCase(); - Path inputGraphTableDir = inputGraphRootDir.resolve(inputGraphTableName); - writeGraphTableAaJobInput(rowDS, inputGraphTableDir.toString()); - return inputGraphTableDir; - } + private static Path createGraphTable(Path inputGraphRootDir, Class rowClazz) { + String inputGraphTableJsonDumpPath = inputGraphTableJsonDumpLocation(rowClazz); + Path inputGraphTableJsonDumpFile = Paths + .get(Objects.requireNonNull(cl.getResource(inputGraphTableJsonDumpPath)).getFile()); + Dataset rowDS = readGraphTableFromJsonDump(inputGraphTableJsonDumpFile.toString(), rowClazz); + String inputGraphTableName = 
rowClazz.getSimpleName().toLowerCase(); + Path inputGraphTableDir = inputGraphRootDir.resolve(inputGraphTableName); + writeGraphTableAaJobInput(rowDS, inputGraphTableDir.toString()); + return inputGraphTableDir; + } - private static String inputGraphTableJsonDumpLocation(Class rowClazz) { - return String.format( - "%s/%s.json", - "eu/dnetlib/dhp/actionmanager/promote/input/graph", rowClazz.getSimpleName().toLowerCase()); - } + private static String inputGraphTableJsonDumpLocation(Class rowClazz) { + return String + .format( + "%s/%s.json", + "eu/dnetlib/dhp/actionmanager/promote/input/graph", rowClazz.getSimpleName().toLowerCase()); + } - private static Dataset readGraphTableFromJsonDump( - String path, Class rowClazz) { - return spark - .read() - .textFile(path) - .map( - (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), - Encoders.bean(rowClazz)); - } + private static Dataset readGraphTableFromJsonDump( + String path, Class rowClazz) { + return spark + .read() + .textFile(path) + .map( + (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), + Encoders.bean(rowClazz)); + } - private static void writeGraphTableAaJobInput(Dataset rowDS, String path) { - rowDS.write().option("compression", "gzip").json(path); - } + private static void writeGraphTableAaJobInput(Dataset rowDS, String path) { + rowDS.write().option("compression", "gzip").json(path); + } - private static Path createActionPayload( - Path inputActionPayloadRootDir, Class rowClazz, Class actionPayloadClazz) { - String inputActionPayloadJsonDumpPath = - inputActionPayloadJsonDumpLocation(rowClazz, actionPayloadClazz); - Path inputActionPayloadJsonDumpFile = - Paths.get(Objects.requireNonNull(cl.getResource(inputActionPayloadJsonDumpPath)).getFile()); - Dataset actionPayloadDS = - readActionPayloadFromJsonDump(inputActionPayloadJsonDumpFile.toString()); - Path inputActionPayloadDir = - inputActionPayloadRootDir.resolve(actionPayloadClazz.getSimpleName().toLowerCase()); - 
writeActionPayloadAsJobInput(actionPayloadDS, inputActionPayloadDir.toString()); - return inputActionPayloadDir; - } + private static Path createActionPayload( + Path inputActionPayloadRootDir, Class rowClazz, Class actionPayloadClazz) { + String inputActionPayloadJsonDumpPath = inputActionPayloadJsonDumpLocation(rowClazz, actionPayloadClazz); + Path inputActionPayloadJsonDumpFile = Paths + .get(Objects.requireNonNull(cl.getResource(inputActionPayloadJsonDumpPath)).getFile()); + Dataset actionPayloadDS = readActionPayloadFromJsonDump(inputActionPayloadJsonDumpFile.toString()); + Path inputActionPayloadDir = inputActionPayloadRootDir + .resolve(actionPayloadClazz.getSimpleName().toLowerCase()); + writeActionPayloadAsJobInput(actionPayloadDS, inputActionPayloadDir.toString()); + return inputActionPayloadDir; + } - private static String inputActionPayloadJsonDumpLocation( - Class rowClazz, Class actionPayloadClazz) { + private static String inputActionPayloadJsonDumpLocation( + Class rowClazz, Class actionPayloadClazz) { - return String.format( - "eu/dnetlib/dhp/actionmanager/promote/input/action_payload/%s_table/%s.json", - rowClazz.getSimpleName().toLowerCase(), actionPayloadClazz.getSimpleName().toLowerCase()); - } + return String + .format( + "eu/dnetlib/dhp/actionmanager/promote/input/action_payload/%s_table/%s.json", + rowClazz.getSimpleName().toLowerCase(), actionPayloadClazz.getSimpleName().toLowerCase()); + } - private static Dataset readActionPayloadFromJsonDump(String path) { - return spark.read().textFile(path); - } + private static Dataset readActionPayloadFromJsonDump(String path) { + return spark.read().textFile(path); + } - private static void writeActionPayloadAsJobInput(Dataset actionPayloadDS, String path) { - actionPayloadDS.withColumnRenamed("value", "payload").write().parquet(path); - } + private static void writeActionPayloadAsJobInput(Dataset actionPayloadDS, String path) { + actionPayloadDS.withColumnRenamed("value", 
"payload").write().parquet(path); + } - private static Dataset readGraphTableFromJobOutput( - String path, Class rowClazz) { - return spark - .read() - .textFile(path) - .map( - (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), - Encoders.bean(rowClazz)); - } + private static Dataset readGraphTableFromJobOutput( + String path, Class rowClazz) { + return spark + .read() + .textFile(path) + .map( + (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), + Encoders.bean(rowClazz)); + } - private static String resultFileLocation( - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) { - return String.format( - "eu/dnetlib/dhp/actionmanager/promote/output/graph/%s/%s/%s_action_payload/result.json", - strategy.name().toLowerCase(), - rowClazz.getSimpleName().toLowerCase(), - actionPayloadClazz.getSimpleName().toLowerCase()); - } + private static String resultFileLocation( + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) { + return String + .format( + "eu/dnetlib/dhp/actionmanager/promote/output/graph/%s/%s/%s_action_payload/result.json", + strategy.name().toLowerCase(), + rowClazz.getSimpleName().toLowerCase(), + actionPayloadClazz.getSimpleName().toLowerCase()); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java index 9abb0858fa..477e4b204a 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java @@ -1,15 +1,15 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; 
-import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; -import eu.dnetlib.dhp.schema.oaf.Oaf; import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.function.BiFunction; import java.util.function.Function; + import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -19,314 +19,311 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; +import eu.dnetlib.dhp.schema.oaf.Oaf; + public class PromoteActionPayloadFunctionsTest { - private static SparkSession spark; + private static SparkSession spark; - @BeforeAll - public static void beforeAll() { - SparkConf conf = new SparkConf(); - conf.setMaster("local"); - conf.setAppName(PromoteActionPayloadFunctionsTest.class.getSimpleName()); - conf.set("spark.driver.host", "localhost"); - spark = SparkSession.builder().config(conf).getOrCreate(); - } + @BeforeAll + public static void beforeAll() { + SparkConf conf = new SparkConf(); + conf.setMaster("local"); + conf.setAppName(PromoteActionPayloadFunctionsTest.class.getSimpleName()); + conf.set("spark.driver.host", "localhost"); + spark = SparkSession.builder().config(conf).getOrCreate(); + } - @AfterAll - public static void afterAll() { - spark.stop(); - } + @AfterAll + public static void afterAll() { + spark.stop(); + } - @Nested - class JoinTableWithActionPayloadAndMerge { + @Nested + class JoinTableWithActionPayloadAndMerge { - @Test - public void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() { - // given - class OafImpl extends Oaf {} + @Test + public void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() { + // given + class OafImpl extends Oaf { + } - // when - assertThrows( - RuntimeException.class, - () -> - PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( - null, null, null, 
null, null, OafImplSubSub.class, OafImpl.class)); - } + // when + assertThrows( + RuntimeException.class, + () -> PromoteActionPayloadFunctions + .joinGraphTableWithActionPayloadAndMerge( + null, null, null, null, null, OafImplSubSub.class, OafImpl.class)); + } - @Test - public void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() { - // given - String id0 = "id0"; - String id1 = "id1"; - String id2 = "id2"; - String id3 = "id3"; - String id4 = "id4"; - List rowData = - Arrays.asList( - createOafImplSubSub(id0), - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id3)); - Dataset rowDS = - spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); + @Test + public void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() { + // given + String id0 = "id0"; + String id1 = "id1"; + String id2 = "id2"; + String id3 = "id3"; + String id4 = "id4"; + List rowData = Arrays + .asList( + createOafImplSubSub(id0), + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id3)); + Dataset rowDS = spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); - List actionPayloadData = - Arrays.asList( - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id2), - createOafImplSubSub(id3), - createOafImplSubSub(id3), - createOafImplSubSub(id3), - createOafImplSubSub(id4), - createOafImplSubSub(id4), - createOafImplSubSub(id4), - createOafImplSubSub(id4)); - Dataset actionPayloadDS = - spark.createDataset(actionPayloadData, Encoders.bean(OafImplSubSub.class)); + List actionPayloadData = Arrays + .asList( + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id2), + createOafImplSubSub(id3), + createOafImplSubSub(id3), + createOafImplSubSub(id3), + createOafImplSubSub(id4), + createOafImplSubSub(id4), + createOafImplSubSub(id4), + createOafImplSubSub(id4)); + Dataset actionPayloadDS = spark + .createDataset(actionPayloadData, 
Encoders.bean(OafImplSubSub.class)); - SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; - SerializableSupplier> actionPayloadIdFn = - () -> OafImplRoot::getId; - SerializableSupplier> mergeAndGetFn = - () -> - (x, y) -> { - x.merge(y); - return x; - }; + SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; + SerializableSupplier> actionPayloadIdFn = () -> OafImplRoot::getId; + SerializableSupplier> mergeAndGetFn = () -> (x, + y) -> { + x.merge(y); + return x; + }; - // when - List results = - PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( - rowDS, - actionPayloadDS, - rowIdFn, - actionPayloadIdFn, - mergeAndGetFn, - OafImplSubSub.class, - OafImplSubSub.class) - .collectAsList(); + // when + List results = PromoteActionPayloadFunctions + .joinGraphTableWithActionPayloadAndMerge( + rowDS, + actionPayloadDS, + rowIdFn, + actionPayloadIdFn, + mergeAndGetFn, + OafImplSubSub.class, + OafImplSubSub.class) + .collectAsList(); - // then - assertEquals(11, results.size()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id0)).count()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); - assertEquals(2, results.stream().filter(x -> x.getId().equals(id2)).count()); - assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); - assertEquals(4, results.stream().filter(x -> x.getId().equals(id4)).count()); + // then + assertEquals(11, results.size()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id0)).count()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); + assertEquals(2, results.stream().filter(x -> x.getId().equals(id2)).count()); + assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); + assertEquals(4, results.stream().filter(x -> x.getId().equals(id4)).count()); - results.forEach( - result -> { - switch (result.getId()) { - case "id0": - assertEquals(1, result.getMerged()); - break; - case "id1": 
- case "id2": - case "id3": - assertEquals(2, result.getMerged()); - break; - case "id4": - assertEquals(1, result.getMerged()); - break; - default: - throw new RuntimeException(); - } - }); - } + results + .forEach( + result -> { + switch (result.getId()) { + case "id0": + assertEquals(1, result.getMerged()); + break; + case "id1": + case "id2": + case "id3": + assertEquals(2, result.getMerged()); + break; + case "id4": + assertEquals(1, result.getMerged()); + break; + default: + throw new RuntimeException(); + } + }); + } - @Test - public void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() { - // given - String id0 = "id0"; - String id1 = "id1"; - String id2 = "id2"; - String id3 = "id3"; - String id4 = "id4"; - List rowData = - Arrays.asList( - createOafImplSubSub(id0), - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id3)); - Dataset rowDS = - spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); + @Test + public void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() { + // given + String id0 = "id0"; + String id1 = "id1"; + String id2 = "id2"; + String id3 = "id3"; + String id4 = "id4"; + List rowData = Arrays + .asList( + createOafImplSubSub(id0), + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id3)); + Dataset rowDS = spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); - List actionPayloadData = - Arrays.asList( - createOafImplSub(id1), - createOafImplSub(id2), - createOafImplSub(id2), - createOafImplSub(id3), - createOafImplSub(id3), - createOafImplSub(id3), - createOafImplSub(id4), - createOafImplSub(id4), - createOafImplSub(id4), - createOafImplSub(id4)); - Dataset actionPayloadDS = - spark.createDataset(actionPayloadData, Encoders.bean(OafImplSub.class)); + List actionPayloadData = Arrays + .asList( + createOafImplSub(id1), + createOafImplSub(id2), + createOafImplSub(id2), + createOafImplSub(id3), + createOafImplSub(id3), + 
createOafImplSub(id3), + createOafImplSub(id4), + createOafImplSub(id4), + createOafImplSub(id4), + createOafImplSub(id4)); + Dataset actionPayloadDS = spark + .createDataset(actionPayloadData, Encoders.bean(OafImplSub.class)); - SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; - SerializableSupplier> actionPayloadIdFn = - () -> OafImplRoot::getId; - SerializableSupplier> mergeAndGetFn = - () -> - (x, y) -> { - x.merge(y); - return x; - }; + SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; + SerializableSupplier> actionPayloadIdFn = () -> OafImplRoot::getId; + SerializableSupplier> mergeAndGetFn = () -> (x, y) -> { + x.merge(y); + return x; + }; - // when - List results = - PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( - rowDS, - actionPayloadDS, - rowIdFn, - actionPayloadIdFn, - mergeAndGetFn, - OafImplSubSub.class, - OafImplSub.class) - .collectAsList(); + // when + List results = PromoteActionPayloadFunctions + .joinGraphTableWithActionPayloadAndMerge( + rowDS, + actionPayloadDS, + rowIdFn, + actionPayloadIdFn, + mergeAndGetFn, + OafImplSubSub.class, + OafImplSub.class) + .collectAsList(); - // then - assertEquals(7, results.size()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id0)).count()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); - assertEquals(2, results.stream().filter(x -> x.getId().equals(id2)).count()); - assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); - assertEquals(0, results.stream().filter(x -> x.getId().equals(id4)).count()); + // then + assertEquals(7, results.size()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id0)).count()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); + assertEquals(2, results.stream().filter(x -> x.getId().equals(id2)).count()); + assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); + assertEquals(0, 
results.stream().filter(x -> x.getId().equals(id4)).count()); - results.forEach( - result -> { - switch (result.getId()) { - case "id0": - assertEquals(1, result.getMerged()); - break; - case "id1": - case "id2": - case "id3": - assertEquals(2, result.getMerged()); - break; - default: - throw new RuntimeException(); - } - }); - } - } + results + .forEach( + result -> { + switch (result.getId()) { + case "id0": + assertEquals(1, result.getMerged()); + break; + case "id1": + case "id2": + case "id3": + assertEquals(2, result.getMerged()); + break; + default: + throw new RuntimeException(); + } + }); + } + } - @Nested - class GroupTableByIdAndMerge { + @Nested + class GroupTableByIdAndMerge { - @Test - public void shouldRunProperly() { - // given - String id1 = "id1"; - String id2 = "id2"; - String id3 = "id3"; - List rowData = - Arrays.asList( - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id2), - createOafImplSubSub(id3), - createOafImplSubSub(id3), - createOafImplSubSub(id3)); - Dataset rowDS = - spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); + @Test + public void shouldRunProperly() { + // given + String id1 = "id1"; + String id2 = "id2"; + String id3 = "id3"; + List rowData = Arrays + .asList( + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id2), + createOafImplSubSub(id3), + createOafImplSubSub(id3), + createOafImplSubSub(id3)); + Dataset rowDS = spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); - SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; - SerializableSupplier> mergeAndGetFn = - () -> - (x, y) -> { - x.merge(y); - return x; - }; - SerializableSupplier zeroFn = OafImplSubSub::new; - SerializableSupplier> isNotZeroFn = - () -> x -> Objects.nonNull(x.getId()); + SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; + SerializableSupplier> mergeAndGetFn = () -> (x, + y) -> { + x.merge(y); + return x; + }; + SerializableSupplier zeroFn = 
OafImplSubSub::new; + SerializableSupplier> isNotZeroFn = () -> x -> Objects.nonNull(x.getId()); - // when - List results = - PromoteActionPayloadFunctions.groupGraphTableByIdAndMerge( - rowDS, rowIdFn, mergeAndGetFn, zeroFn, isNotZeroFn, OafImplSubSub.class) - .collectAsList(); + // when + List results = PromoteActionPayloadFunctions + .groupGraphTableByIdAndMerge( + rowDS, rowIdFn, mergeAndGetFn, zeroFn, isNotZeroFn, OafImplSubSub.class) + .collectAsList(); - // then - assertEquals(3, results.size()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id2)).count()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id3)).count()); + // then + assertEquals(3, results.size()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id2)).count()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id3)).count()); - results.forEach( - result -> { - switch (result.getId()) { - case "id1": - assertEquals(1, result.getMerged()); - break; - case "id2": - assertEquals(2, result.getMerged()); - break; - case "id3": - assertEquals(3, result.getMerged()); - break; - default: - throw new RuntimeException(); - } - }); - } - } + results + .forEach( + result -> { + switch (result.getId()) { + case "id1": + assertEquals(1, result.getMerged()); + break; + case "id2": + assertEquals(2, result.getMerged()); + break; + case "id3": + assertEquals(3, result.getMerged()); + break; + default: + throw new RuntimeException(); + } + }); + } + } - public static class OafImplRoot extends Oaf { - private String id; - private int merged = 1; + public static class OafImplRoot extends Oaf { + private String id; + private int merged = 1; - public void merge(OafImplRoot e) { - merged += e.merged; - } + public void merge(OafImplRoot e) { + merged += e.merged; + } - public String getId() { - 
return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public int getMerged() { - return merged; - } + public int getMerged() { + return merged; + } - public void setMerged(int merged) { - this.merged = merged; - } - } + public void setMerged(int merged) { + this.merged = merged; + } + } - public static class OafImplSub extends OafImplRoot { + public static class OafImplSub extends OafImplRoot { - @Override - public void merge(OafImplRoot e) { - super.merge(e); - } - } + @Override + public void merge(OafImplRoot e) { + super.merge(e); + } + } - private static OafImplSub createOafImplSub(String id) { - OafImplSub x = new OafImplSub(); - x.setId(id); - return x; - } + private static OafImplSub createOafImplSub(String id) { + OafImplSub x = new OafImplSub(); + x.setId(id); + return x; + } - public static class OafImplSubSub extends OafImplSub { + public static class OafImplSubSub extends OafImplSub { - @Override - public void merge(OafImplRoot e) { - super.merge(e); - } - } + @Override + public void merge(OafImplRoot e) { + super.merge(e); + } + } - private static OafImplSubSub createOafImplSubSub(String id) { - OafImplSubSub x = new OafImplSubSub(); - x.setId(id); - return x; - } + private static OafImplSubSub createOafImplSubSub(String id) { + OafImplSubSub x = new OafImplSubSub(); + x.setId(id); + return x; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index 526bff2e1b..9811fb7073 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -1,17 +1,12 @@ + package eu.dnetlib.dhp.collection; -import 
com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; import java.io.ByteArrayInputStream; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; import java.util.Objects; + import org.apache.commons.cli.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -29,127 +24,138 @@ import org.dom4j.Document; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; +import eu.dnetlib.message.Message; +import eu.dnetlib.message.MessageManager; +import eu.dnetlib.message.MessageType; + public class GenerateNativeStoreSparkJob { - public static MetadataRecord parseRecord( - final String input, - final String xpath, - final String encoding, - final Provenance provenance, - final Long dateOfCollection, - final LongAccumulator totalItems, - final LongAccumulator invalidRecords) { + public static MetadataRecord parseRecord( + final String input, + final String xpath, + final String encoding, + final Provenance provenance, + final Long dateOfCollection, + final LongAccumulator totalItems, + final LongAccumulator invalidRecords) { - if (totalItems != null) totalItems.add(1); - try { - SAXReader reader = new SAXReader(); - Document document = - reader.read(new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8))); - Node node = document.selectSingleNode(xpath); - final String originalIdentifier = node.getText(); - if (StringUtils.isBlank(originalIdentifier)) { - if (invalidRecords != null) invalidRecords.add(1); - return 
null; - } - return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection); - } catch (Throwable e) { - if (invalidRecords != null) invalidRecords.add(1); - e.printStackTrace(); - return null; - } - } + if (totalItems != null) + totalItems.add(1); + try { + SAXReader reader = new SAXReader(); + Document document = reader.read(new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8))); + Node node = document.selectSingleNode(xpath); + final String originalIdentifier = node.getText(); + if (StringUtils.isBlank(originalIdentifier)) { + if (invalidRecords != null) + invalidRecords.add(1); + return null; + } + return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection); + } catch (Throwable e) { + if (invalidRecords != null) + invalidRecords.add(1); + e.printStackTrace(); + return null; + } + } - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - GenerateNativeStoreSparkJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); - parser.parseArgument(args); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateNativeStoreSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); + parser.parseArgument(args); + final ObjectMapper jsonMapper = new ObjectMapper(); + final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class); + final long dateOfCollection = new Long(parser.get("dateOfCollection")); - final SparkSession spark = - 
SparkSession.builder() - .appName("GenerateNativeStoreSparkJob") - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = SparkSession + .builder() + .appName("GenerateNativeStoreSparkJob") + .master(parser.get("master")) + .getOrCreate(); - final Map ongoingMap = new HashMap<>(); - final Map reportMap = new HashMap<>(); + final Map ongoingMap = new HashMap<>(); + final Map reportMap = new HashMap<>(); - final boolean test = - parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); + final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final JavaPairRDD inputRDD = - sc.sequenceFile(parser.get("input"), IntWritable.class, Text.class); + final JavaPairRDD inputRDD = sc + .sequenceFile(parser.get("input"), IntWritable.class, Text.class); - final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); + final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); - final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); + final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); - final MessageManager manager = - new MessageManager( - parser.get("rabbitHost"), - parser.get("rabbitUser"), - parser.get("rabbitPassword"), - false, - false, - null); + final MessageManager manager = new MessageManager( + parser.get("rabbitHost"), + parser.get("rabbitUser"), + parser.get("rabbitPassword"), + false, + false, + null); - final JavaRDD mappeRDD = - inputRDD - .map( - item -> - parseRecord( - item._2().toString(), - parser.get("xpath"), - parser.get("encoding"), - provenance, - dateOfCollection, - totalItems, - invalidRecords)) - .filter(Objects::nonNull) - .distinct(); + final JavaRDD mappeRDD = inputRDD + .map( + item -> parseRecord( + item._2().toString(), + 
parser.get("xpath"), + parser.get("encoding"), + provenance, + dateOfCollection, + totalItems, + invalidRecords)) + .filter(Objects::nonNull) + .distinct(); - ongoingMap.put("ongoing", "0"); - if (!test) { - manager.sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } + ongoingMap.put("ongoing", "0"); + if (!test) { + manager + .sendMessage( + new Message( + parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), + parser.get("rabbitOngoingQueue"), + true, + false); + } - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstore = spark.createDataset(mappeRDD.rdd(), encoder); - final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); - mdStoreRecords.add(mdstore.count()); - ongoingMap.put("ongoing", "" + totalItems.value()); - if (!test) { - manager.sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } - mdstore.write().format("parquet").save(parser.get("output")); - reportMap.put("inputItem", "" + totalItems.value()); - reportMap.put("invalidRecords", "" + invalidRecords.value()); - reportMap.put("mdStoreSize", "" + mdStoreRecords.value()); - if (!test) { - manager.sendMessage( - new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - parser.get("rabbitReportQueue"), - true, - false); - manager.close(); - } - } + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mdstore = spark.createDataset(mappeRDD.rdd(), encoder); + final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); + mdStoreRecords.add(mdstore.count()); + ongoingMap.put("ongoing", "" + totalItems.value()); + if (!test) { + manager + .sendMessage( + new Message( + parser.get("workflowId"), "DataFrameCreation", 
MessageType.ONGOING, ongoingMap), + parser.get("rabbitOngoingQueue"), + true, + false); + } + mdstore.write().format("parquet").save(parser.get("output")); + reportMap.put("inputItem", "" + totalItems.value()); + reportMap.put("invalidRecords", "" + invalidRecords.value()); + reportMap.put("mdStoreSize", "" + mdStoreRecords.value()); + if (!test) { + manager + .sendMessage( + new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), + parser.get("rabbitReportQueue"), + true, + false); + manager.close(); + } + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 82f28afe64..4a0c70c459 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -1,10 +1,12 @@ + package eu.dnetlib.dhp.collection.plugin; +import java.util.stream.Stream; + import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -import java.util.stream.Stream; public interface CollectorPlugin { - Stream collect(ApiDescriptor api) throws DnetCollectorException; + Stream collect(ApiDescriptor api) throws DnetCollectorException; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 415102a1a8..7f71f401d9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -1,11 +1,6 @@ + package eu.dnetlib.dhp.collection.plugin.oai; -import 
com.google.common.base.Splitter; -import com.google.common.collect.Iterators; -import com.google.common.collect.Lists; -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -14,65 +9,74 @@ import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterators; +import com.google.common.collect.Lists; + +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; + public class OaiCollectorPlugin implements CollectorPlugin { - private static final String FORMAT_PARAM = "format"; - private static final String OAI_SET_PARAM = "set"; - private static final Object OAI_FROM_DATE_PARAM = "fromDate"; - private static final Object OAI_UNTIL_DATE_PARAM = "untilDate"; + private static final String FORMAT_PARAM = "format"; + private static final String OAI_SET_PARAM = "set"; + private static final Object OAI_FROM_DATE_PARAM = "fromDate"; + private static final Object OAI_UNTIL_DATE_PARAM = "untilDate"; - private OaiIteratorFactory oaiIteratorFactory; + private OaiIteratorFactory oaiIteratorFactory; - @Override - public Stream collect(final ApiDescriptor api) throws DnetCollectorException { - final String baseUrl = api.getBaseUrl(); - final String mdFormat = api.getParams().get(FORMAT_PARAM); - final String setParam = api.getParams().get(OAI_SET_PARAM); - final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM); - final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM); + @Override + public Stream collect(final ApiDescriptor api) throws DnetCollectorException { + final String baseUrl = api.getBaseUrl(); + final String 
mdFormat = api.getParams().get(FORMAT_PARAM); + final String setParam = api.getParams().get(OAI_SET_PARAM); + final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM); + final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM); - final List sets = new ArrayList<>(); - if (setParam != null) { - sets.addAll( - Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam))); - } - if (sets.isEmpty()) { - // If no set is defined, ALL the sets must be harvested - sets.add(""); - } + final List sets = new ArrayList<>(); + if (setParam != null) { + sets + .addAll( + Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam))); + } + if (sets.isEmpty()) { + // If no set is defined, ALL the sets must be harvested + sets.add(""); + } - if (baseUrl == null || baseUrl.isEmpty()) { - throw new DnetCollectorException("Param 'baseurl' is null or empty"); - } + if (baseUrl == null || baseUrl.isEmpty()) { + throw new DnetCollectorException("Param 'baseurl' is null or empty"); + } - if (mdFormat == null || mdFormat.isEmpty()) { - throw new DnetCollectorException("Param 'mdFormat' is null or empty"); - } + if (mdFormat == null || mdFormat.isEmpty()) { + throw new DnetCollectorException("Param 'mdFormat' is null or empty"); + } - if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); - } + if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); + } - if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); - } + if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); + } - final Iterator> iters = - sets.stream() - .map( - set -> - getOaiIteratorFactory() - 
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) - .iterator(); + final Iterator> iters = sets + .stream() + .map( + set -> getOaiIteratorFactory() + .newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) + .iterator(); - return StreamSupport.stream( - Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false); - } + return StreamSupport + .stream( + Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false); + } - public OaiIteratorFactory getOaiIteratorFactory() { - if (oaiIteratorFactory == null) { - oaiIteratorFactory = new OaiIteratorFactory(); - } - return oaiIteratorFactory; - } + public OaiIteratorFactory getOaiIteratorFactory() { + if (oaiIteratorFactory == null) { + oaiIteratorFactory = new OaiIteratorFactory(); + } + return oaiIteratorFactory; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 13c40de06b..d61f13fb5f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -1,14 +1,13 @@ + package eu.dnetlib.dhp.collection.plugin.oai; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; -import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.Iterator; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; + import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -17,160 +16,162 @@ import org.dom4j.DocumentException; import org.dom4j.Node; import 
org.dom4j.io.SAXReader; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; + public class OaiIterator implements Iterator { - private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on - // 11/24/08 5:02 PM + private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on + // 11/24/08 5:02 PM - private final Queue queue = new PriorityBlockingQueue<>(); - private final SAXReader reader = new SAXReader(); + private final Queue queue = new PriorityBlockingQueue<>(); + private final SAXReader reader = new SAXReader(); - private final String baseUrl; - private final String set; - private final String mdFormat; - private final String fromDate; - private final String untilDate; - private String token; - private boolean started; - private final HttpConnector httpConnector; + private final String baseUrl; + private final String set; + private final String mdFormat; + private final String fromDate; + private final String untilDate; + private String token; + private boolean started; + private final HttpConnector httpConnector; - public OaiIterator( - final String baseUrl, - final String mdFormat, - final String set, - final String fromDate, - final String untilDate, - final HttpConnector httpConnector) { - this.baseUrl = baseUrl; - this.mdFormat = mdFormat; - this.set = set; - this.fromDate = fromDate; - this.untilDate = untilDate; - this.started = false; - this.httpConnector = httpConnector; - } + public OaiIterator( + final String baseUrl, + final String mdFormat, + final String set, + final String fromDate, + final String untilDate, + final HttpConnector httpConnector) { + this.baseUrl = baseUrl; + this.mdFormat = mdFormat; + this.set = set; + this.fromDate = fromDate; + this.untilDate = untilDate; + this.started = false; + this.httpConnector = httpConnector; + } - private void 
verifyStarted() { - if (!this.started) { - this.started = true; - try { - this.token = firstPage(); - } catch (final DnetCollectorException e) { - throw new RuntimeException(e); - } - } - } + private void verifyStarted() { + if (!this.started) { + this.started = true; + try { + this.token = firstPage(); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + } - @Override - public boolean hasNext() { - synchronized (queue) { - verifyStarted(); - return !queue.isEmpty(); - } - } + @Override + public boolean hasNext() { + synchronized (queue) { + verifyStarted(); + return !queue.isEmpty(); + } + } - @Override - public String next() { - synchronized (queue) { - verifyStarted(); - final String res = queue.poll(); - while (queue.isEmpty() && token != null && !token.isEmpty()) { - try { - token = otherPages(token); - } catch (final DnetCollectorException e) { - throw new RuntimeException(e); - } - } - return res; - } - } + @Override + public String next() { + synchronized (queue) { + verifyStarted(); + final String res = queue.poll(); + while (queue.isEmpty() && token != null && !token.isEmpty()) { + try { + token = otherPages(token); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + return res; + } + } - @Override - public void remove() {} + @Override + public void remove() { + } - private String firstPage() throws DnetCollectorException { - try { - String url = - baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); - if (set != null && !set.isEmpty()) { - url += "&set=" + URLEncoder.encode(set, "UTF-8"); - } - if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); - } - if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); - } - log.info("Start harvesting using url: " + url); + private String firstPage() throws 
DnetCollectorException { + try { + String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); + if (set != null && !set.isEmpty()) { + url += "&set=" + URLEncoder.encode(set, "UTF-8"); + } + if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); + } + if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); + } + log.info("Start harvesting using url: " + url); - return downloadPage(url); - } catch (final UnsupportedEncodingException e) { - throw new DnetCollectorException(e); - } - } + return downloadPage(url); + } catch (final UnsupportedEncodingException e) { + throw new DnetCollectorException(e); + } + } - private String extractResumptionToken(final String xml) { + private String extractResumptionToken(final String xml) { - final String s = StringUtils.substringAfter(xml, "", "", " newIterator( - final String baseUrl, - final String mdFormat, - final String set, - final String fromDate, - final String untilDate) { - return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector()); - } + public Iterator newIterator( + final String baseUrl, + final String mdFormat, + final String set, + final String fromDate, + final String untilDate) { + return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector()); + } - private HttpConnector getHttpConnector() { - if (httpConnector == null) httpConnector = new HttpConnector(); - return httpConnector; - } + private HttpConnector getHttpConnector() { + if (httpConnector == null) + httpConnector = new HttpConnector(); + return httpConnector; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java index 320f735b35..f40962c217 
100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java @@ -1,31 +1,32 @@ + package eu.dnetlib.dhp.collection.worker; public class DnetCollectorException extends Exception { - /** */ - private static final long serialVersionUID = -290723075076039757L; + /** */ + private static final long serialVersionUID = -290723075076039757L; - public DnetCollectorException() { - super(); - } + public DnetCollectorException() { + super(); + } - public DnetCollectorException( - final String message, - final Throwable cause, - final boolean enableSuppression, - final boolean writableStackTrace) { - super(message, cause, enableSuppression, writableStackTrace); - } + public DnetCollectorException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } - public DnetCollectorException(final String message, final Throwable cause) { - super(message, cause); - } + public DnetCollectorException(final String message, final Throwable cause) { + super(message, cause); + } - public DnetCollectorException(final String message) { - super(message); - } + public DnetCollectorException(final String message) { + super(message); + } - public DnetCollectorException(final Throwable cause) { - super(cause); - } + public DnetCollectorException(final Throwable cause) { + super(cause); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java index d76ec8e378..e686ad5180 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java @@ -1,18 +1,12 @@ + package eu.dnetlib.dhp.collection.worker; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; import java.io.IOException; import java.net.URI; import java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -22,111 +16,124 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; +import eu.dnetlib.message.Message; +import eu.dnetlib.message.MessageManager; +import eu.dnetlib.message.MessageType; + public class DnetCollectorWorker { - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class); + private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class); - private final CollectorPluginFactory collectorPluginFactory; + private final CollectorPluginFactory collectorPluginFactory; - private final ArgumentApplicationParser argumentParser; + private final ArgumentApplicationParser argumentParser; - private final MessageManager manager; + private final MessageManager manager; - public DnetCollectorWorker( - final CollectorPluginFactory 
collectorPluginFactory, - final ArgumentApplicationParser argumentParser, - final MessageManager manager) - throws DnetCollectorException { - this.collectorPluginFactory = collectorPluginFactory; - this.argumentParser = argumentParser; - this.manager = manager; - } + public DnetCollectorWorker( + final CollectorPluginFactory collectorPluginFactory, + final ArgumentApplicationParser argumentParser, + final MessageManager manager) + throws DnetCollectorException { + this.collectorPluginFactory = collectorPluginFactory; + this.argumentParser = argumentParser; + this.manager = manager; + } - public void collect() throws DnetCollectorException { - try { - final ObjectMapper jsonMapper = new ObjectMapper(); - final ApiDescriptor api = - jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class); + public void collect() throws DnetCollectorException { + try { + final ObjectMapper jsonMapper = new ObjectMapper(); + final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class); - final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); + final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); - final String hdfsuri = argumentParser.get("namenode"); + final String hdfsuri = argumentParser.get("namenode"); - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", 
org.apache.hadoop.fs.LocalFileSystem.class.getName()); - System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS")); - System.setProperty("hadoop.home.dir", "/"); - // Get the filesystem - HDFS - FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(argumentParser.get("hdfsPath")); + System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS")); + System.setProperty("hadoop.home.dir", "/"); + // Get the filesystem - HDFS + FileSystem.get(URI.create(hdfsuri), conf); + Path hdfswritepath = new Path(argumentParser.get("hdfsPath")); - log.info("Created path " + hdfswritepath.toString()); + log.info("Created path " + hdfswritepath.toString()); - final Map ongoingMap = new HashMap<>(); - final Map reportMap = new HashMap<>(); - final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer writer = - SequenceFile.createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final IntWritable key = new IntWritable(counter.get()); - final Text value = new Text(); - plugin - .collect(api) - .forEach( - content -> { - key.set(counter.getAndIncrement()); - value.set(content); - if (counter.get() % 10 == 0) { - try { - ongoingMap.put("ongoing", "" + counter.get()); - log.debug( - "Sending message: " - + manager.sendMessage( - new Message( - argumentParser.get("workflowId"), - "Collection", - MessageType.ONGOING, - ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false)); - } catch (Exception e) { - log.error("Error on sending message ", e); - } - } - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - } - ongoingMap.put("ongoing", "" + counter.get()); - manager.sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - 
reportMap.put("collected", "" + counter.get()); - manager.sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - manager.close(); - } catch (Throwable e) { - throw new DnetCollectorException("Error on collecting ", e); - } - } + final Map ongoingMap = new HashMap<>(); + final Map reportMap = new HashMap<>(); + final AtomicInteger counter = new AtomicInteger(0); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { + final IntWritable key = new IntWritable(counter.get()); + final Text value = new Text(); + plugin + .collect(api) + .forEach( + content -> { + key.set(counter.getAndIncrement()); + value.set(content); + if (counter.get() % 10 == 0) { + try { + ongoingMap.put("ongoing", "" + counter.get()); + log + .debug( + "Sending message: " + + manager + .sendMessage( + new Message( + argumentParser.get("workflowId"), + "Collection", + MessageType.ONGOING, + ongoingMap), + argumentParser.get("rabbitOngoingQueue"), + true, + false)); + } catch (Exception e) { + log.error("Error on sending message ", e); + } + } + try { + writer.append(key, value); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + ongoingMap.put("ongoing", "" + counter.get()); + manager + .sendMessage( + new Message( + argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap), + argumentParser.get("rabbitOngoingQueue"), + true, + false); + reportMap.put("collected", "" + counter.get()); + manager + .sendMessage( + new Message( + argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), + argumentParser.get("rabbitOngoingQueue"), + true, + false); + manager.close(); + } catch (Throwable e) { + throw new DnetCollectorException("Error on collecting ", e); + 
} + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java index 7ff61d6774..cda07d1515 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java @@ -1,48 +1,49 @@ + package eu.dnetlib.dhp.collection.worker; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.MessageManager; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; +import eu.dnetlib.message.MessageManager; + /** - * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into - * HDFS. This module will be executed on the hadoop cluster and taking in input some parameters that - * tells it which is the right collector plugin to use and where store the data into HDFS path + * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. 
This module + * will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector + * plugin to use and where store the data into HDFS path * * @author Sandro La Bruzzo */ public class DnetCollectorWorkerApplication { - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); + private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); - private static CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); + private static CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); - private static ArgumentApplicationParser argumentParser; + private static ArgumentApplicationParser argumentParser; - /** @param args */ - public static void main(final String[] args) throws Exception { + /** @param args */ + public static void main(final String[] args) throws Exception { - argumentParser = - new ArgumentApplicationParser( - IOUtils.toString( - DnetCollectorWorker.class.getResourceAsStream( - "/eu/dnetlib/collector/worker/collector_parameter.json"))); - argumentParser.parseArgument(args); - log.info("hdfsPath =" + argumentParser.get("hdfsPath")); - log.info("json = " + argumentParser.get("apidescriptor")); - final MessageManager manager = - new MessageManager( - argumentParser.get("rabbitHost"), - argumentParser.get("rabbitUser"), - argumentParser.get("rabbitPassword"), - false, - false, - null); - final DnetCollectorWorker worker = - new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager); - worker.collect(); - } + argumentParser = new ArgumentApplicationParser( + IOUtils + .toString( + DnetCollectorWorker.class + .getResourceAsStream( + "/eu/dnetlib/collector/worker/collector_parameter.json"))); + argumentParser.parseArgument(args); + log.info("hdfsPath =" + argumentParser.get("hdfsPath")); + log.info("json = " + argumentParser.get("apidescriptor")); + final MessageManager manager = 
new MessageManager( + argumentParser.get("rabbitHost"), + argumentParser.get("rabbitUser"), + argumentParser.get("rabbitPassword"), + false, + false, + null); + final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager); + worker.collect(); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java index 27d9827969..6ee8a8b496 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java @@ -1,18 +1,19 @@ + package eu.dnetlib.dhp.collection.worker.utils; import java.util.LinkedList; public class CollectorPluginErrorLogList extends LinkedList { - private static final long serialVersionUID = -6925786561303289704L; + private static final long serialVersionUID = -6925786561303289704L; - @Override - public String toString() { - String log = new String(); - int index = 0; - for (final String errorMessage : this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } + @Override + public String toString() { + String log = new String(); + int index = 0; + for (final String errorMessage : this) { + log += String.format("Retry #%s: %s / ", index++, errorMessage); + } + return log; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java index 8572001191..7a0028e793 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.collection.worker.utils; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; @@ -6,13 +7,14 @@ import eu.dnetlib.dhp.collection.worker.DnetCollectorException; public class CollectorPluginFactory { - public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException { - if (protocol == null) throw new DnetCollectorException("protocol cannot be null"); - switch (protocol.toLowerCase().trim()) { - case "oai": - return new OaiCollectorPlugin(); - default: - throw new DnetCollectorException("UNknown protocol"); - } - } + public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException { + if (protocol == null) + throw new DnetCollectorException("protocol cannot be null"); + switch (protocol.toLowerCase().trim()) { + case "oai": + return new OaiCollectorPlugin(); + default: + throw new DnetCollectorException("UNknown protocol"); + } + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java index 36b08008ab..5d6108fad8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java @@ -1,6 +1,6 @@ + package eu.dnetlib.dhp.collection.worker.utils; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; import java.io.IOException; import java.io.InputStream; import java.net.*; @@ -8,226 +8,237 @@ import java.security.GeneralSecurityException; import java.security.cert.X509Certificate; import java.util.List; import java.util.Map; + import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLContext; import 
javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang.math.NumberUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; + public class HttpConnector { - private static final Log log = LogFactory.getLog(HttpConnector.class); + private static final Log log = LogFactory.getLog(HttpConnector.class); - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds + private int maxNumberOfRetry = 6; + private int defaultDelay = 120; // seconds + private int readTimeOut = 120; // seconds - private String responseType = null; + private String responseType = null; - private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; + private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } + public HttpConnector() { + CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); + } - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws DnetCollectorException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - } + /** + * Given the URL returns the content via HTTP GET + * + * @param requestUrl the URL + * @return the content of the downloaded resource + * @throws DnetCollectorException when retrying more than maxNumberOfRetry times + */ + public String getInputSource(final String requestUrl) throws DnetCollectorException { + return attemptDownlaodAsString(requestUrl, 1, new 
CollectorPluginErrorLogList()); + } - /** - * Given the URL returns the content as a stream via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource as InputStream - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { - return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - } + /** + * Given the URL returns the content as a stream via HTTP GET + * + * @param requestUrl the URL + * @return the content of the downloaded resource as InputStream + * @throws DnetCollectorException when retrying more than maxNumberOfRetry times + */ + public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { + return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); + } - private String attemptDownlaodAsString( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { - try { - final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } + private String attemptDownlaodAsString( + final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) + throws DnetCollectorException { + try { + final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); + try { + return IOUtils.toString(s); + } catch (final IOException e) { + log.error("error while retrieving from 
http-connection occured: " + requestUrl, e); + Thread.sleep(defaultDelay * 1000); + errorList.add(e.getMessage()); + return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); + } finally { + IOUtils.closeQuietly(s); + } + } catch (final InterruptedException e) { + throw new DnetCollectorException(e); + } + } - private InputStream attemptDownload( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { + private InputStream attemptDownload( + final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) + throws DnetCollectorException { - if (retryNumber > maxNumberOfRetry) { - throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); - } + if (retryNumber > maxNumberOfRetry) { + throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); + } - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); - try { - InputStream input = null; + log.debug("Downloading " + requestUrl + " - try: " + retryNumber); + try { + InputStream input = null; - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); + try { + final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); + urlConn.setInstanceFollowRedirects(false); + urlConn.setReadTimeout(readTimeOut * 1000); + urlConn.addRequestProperty("User-Agent", userAgent); - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } + if (log.isDebugEnabled()) { + logHeaderFields(urlConn); + } - final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " 
sec."); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM - || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to " + newUrl); - errorList.add( - String.format( - "%s %s. Moved to: %s", - urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log.error( - String.format( - "HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - Thread.sleep(defaultDelay * 1000); - errorList.add( - String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } + final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); + if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { + log.warn("waiting and repeating request after " + retryAfter + " sec."); + Thread.sleep(retryAfter * 1000); + errorList.add("503 Service Unavailable"); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } else if (urlConn.getResponseCode() == 
HttpURLConnection.HTTP_MOVED_PERM + || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { + final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); + log.debug("The requested url has been moved to " + newUrl); + errorList + .add( + String + .format( + "%s %s. Moved to: %s", + urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); + urlConn.disconnect(); + return attemptDownload(newUrl, retryNumber + 1, errorList); + } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { + log + .error( + String + .format( + "HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); + Thread.sleep(defaultDelay * 1000); + errorList + .add( + String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } else { + input = urlConn.getInputStream(); + responseType = urlConn.getContentType(); + return input; + } + } catch (final IOException e) { + log.error("error while retrieving from http-connection occured: " + requestUrl, e); + Thread.sleep(defaultDelay * 1000); + errorList.add(e.getMessage()); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } + } catch (final InterruptedException e) { + throw new DnetCollectorException(e); + } + } - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); + private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { + log.debug("StatusCode: " + urlConn.getResponseMessage()); - for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (final String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); - } - } - } - } + for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { + if (e.getKey() != null) { + for (final String v : e.getValue()) { + log.debug(" key: " + 
e.getKey() + " - value: " + v); + } + } + } + } - private int obtainRetryAfter(final Map> headerMap) { - for (final String key : headerMap.keySet()) { - if (key != null - && key.toLowerCase().equals("retry-after") - && headerMap.get(key).size() > 0 - && NumberUtils.isNumber(headerMap.get(key).get(0))) { - return Integer.parseInt(headerMap.get(key).get(0)) + 10; - } - } - return -1; - } + private int obtainRetryAfter(final Map> headerMap) { + for (final String key : headerMap.keySet()) { + if (key != null + && key.toLowerCase().equals("retry-after") + && headerMap.get(key).size() > 0 + && NumberUtils.isNumber(headerMap.get(key).get(0))) { + return Integer.parseInt(headerMap.get(key).get(0)) + 10; + } + } + return -1; + } - private String obtainNewLocation(final Map> headerMap) - throws DnetCollectorException { - for (final String key : headerMap.keySet()) { - if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { - return headerMap.get(key).get(0); - } - } - throw new DnetCollectorException( - "The requested url has been MOVED, but 'location' param is MISSING"); - } + private String obtainNewLocation(final Map> headerMap) + throws DnetCollectorException { + for (final String key : headerMap.keySet()) { + if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { + return headerMap.get(key).get(0); + } + } + throw new DnetCollectorException( + "The requested url has been MOVED, but 'location' param is MISSING"); + } - /** - * register for https scheme; this is a workaround and not intended for the use in trusted - * environments - */ - public void initTrustManager() { - final X509TrustManager tm = - new X509TrustManager() { + /** + * register for https scheme; this is a workaround and not intended for the use in trusted environments + */ + public void initTrustManager() { + final X509TrustManager tm = new X509TrustManager() { - @Override - public void checkClientTrusted(final X509Certificate[] xcs, 
final String string) {} + @Override + public void checkClientTrusted(final X509Certificate[] xcs, final String string) { + } - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) {} + @Override + public void checkServerTrusted(final X509Certificate[] xcs, final String string) { + } - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] {tm}, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (final GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + }; + try { + final SSLContext ctx = SSLContext.getInstance("TLS"); + ctx.init(null, new TrustManager[] { + tm + }, null); + HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); + } catch (final GeneralSecurityException e) { + log.fatal(e); + throw new IllegalStateException(e); + } + } - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } + public int getMaxNumberOfRetry() { + return maxNumberOfRetry; + } - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } + public void setMaxNumberOfRetry(final int maxNumberOfRetry) { + this.maxNumberOfRetry = maxNumberOfRetry; + } - public int getDefaultDelay() { - return defaultDelay; - } + public int getDefaultDelay() { + return defaultDelay; + } - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } + public void setDefaultDelay(final int defaultDelay) { + this.defaultDelay = defaultDelay; + } - public int getReadTimeOut() { - return readTimeOut; - } + public int getReadTimeOut() { + return readTimeOut; + } - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } + public void 
setReadTimeOut(final int readTimeOut) { + this.readTimeOut = readTimeOut; + } - public String getResponseType() { - return responseType; - } + public String getResponseType() { + return responseType; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java index 6a9afd591b..32eeeab4b8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.collection.worker.utils; import java.util.HashMap; @@ -9,376 +10,374 @@ import java.util.regex.Pattern; /** @author jochen, Andreas Czerniak */ public class XmlCleaner { - /** Pattern for numeric entities. */ - private static Pattern validCharacterEntityPattern = - Pattern.compile("^&#x?\\d{2,4};"); // $NON-NLS-1$ - // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); - // //$NON-NLS-1$ + /** Pattern for numeric entities. */ + private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); // $NON-NLS-1$ + // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); + // //$NON-NLS-1$ - // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to - private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); + // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to + private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); - /** - * Pattern that negates the allowable XML 4 byte unicode characters. 
Valid are: #x9 | #xA | #xD | - * [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - */ - private static Pattern invalidCharacterPattern = - Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); // $NON-NLS-1$ + /** + * Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | + * [#xE000-#xFFFD] | [#x10000-#x10FFFF] + */ + private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); // $NON-NLS-1$ - // Map entities to their unicode equivalent - private static Set goodEntities = new HashSet<>(); - private static Map badEntities = new HashMap<>(); + // Map entities to their unicode equivalent + private static Set goodEntities = new HashSet<>(); + private static Map badEntities = new HashMap<>(); - static { - // pre-defined XML entities - goodEntities.add("""); // $NON-NLS-1$ // quotation mark - goodEntities.add("&"); // $NON-NLS-1$ // ampersand - goodEntities.add("<"); // $NON-NLS-1$ // less-than sign - goodEntities.add(">"); // $NON-NLS-1$ // greater-than sign - // control entities - // badEntities.put(" ", ""); - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("€", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‚", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ƒ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("„", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("…", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("†", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‡", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ˆ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‰", " "); 
// $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‹", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‘", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("’", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("“", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("”", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("•", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("–", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("—", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("˜", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("™", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("›", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ÿ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - // misc entities - badEntities.put("€", "\u20AC"); // $NON-NLS-1$ //$NON-NLS-2$ // euro - 
badEntities.put("‘", "\u2018"); // $NON-NLS-1$ //$NON-NLS-2$ // left single quotation - // mark - badEntities.put("’", "\u2019"); // $NON-NLS-1$ //$NON-NLS-2$ // right single quotation - // mark - // Latin 1 entities - badEntities.put(" ", "\u00A0"); // $NON-NLS-1$ //$NON-NLS-2$ // no-break space - badEntities.put("¡", "\u00A1"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation - // mark - badEntities.put("¢", "\u00A2"); // $NON-NLS-1$ //$NON-NLS-2$ // cent sign - badEntities.put("£", "\u00A3"); // $NON-NLS-1$ //$NON-NLS-2$ // pound sign - badEntities.put("¤", "\u00A4"); // $NON-NLS-1$ //$NON-NLS-2$ // currency sign - badEntities.put("¥", "\u00A5"); // $NON-NLS-1$ //$NON-NLS-2$ // yen sign - badEntities.put("¦", "\u00A6"); // $NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar - badEntities.put("§", "\u00A7"); // $NON-NLS-1$ //$NON-NLS-2$ // section sign - badEntities.put("¨", "\u00A8"); // $NON-NLS-1$ //$NON-NLS-2$ // diaeresis - badEntities.put("©", "\u00A9"); // $NON-NLS-1$ //$NON-NLS-2$ // copyright sign - badEntities.put("ª", "\u00AA"); // $NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal - // indicator - badEntities.put("«", "\u00AB"); // $NON-NLS-2$ // left-pointing double angle - // quotation mark - badEntities.put("¬", "\u00AC"); // $NON-NLS-1$ //$NON-NLS-2$ // not sign - badEntities.put("­", "\u00AD"); // $NON-NLS-1$ //$NON-NLS-2$ // soft hyphen - badEntities.put("®", "\u00AE"); // $NON-NLS-1$ //$NON-NLS-2$ // registered sign - badEntities.put("¯", "\u00AF"); // $NON-NLS-1$ //$NON-NLS-2$ // macron - badEntities.put("°", "\u00B0"); // $NON-NLS-1$ //$NON-NLS-2$ // degree sign - badEntities.put("±", "\u00B1"); // $NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign - badEntities.put("²", "\u00B2"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript two - badEntities.put("³", "\u00B3"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript three - badEntities.put("´", "\u00B4"); // $NON-NLS-1$ //$NON-NLS-2$ // acute accent - badEntities.put("µ", "\u00B5"); // $NON-NLS-1$ //$NON-NLS-2$ 
// micro sign - badEntities.put("¶", "\u00B6"); // $NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign - badEntities.put("·", "\u00B7"); // $NON-NLS-1$ //$NON-NLS-2$ // middle dot - badEntities.put("¸", "\u00B8"); // $NON-NLS-1$ //$NON-NLS-2$ // cedilla - badEntities.put("¹", "\u00B9"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript one - badEntities.put("º", "\u00BA"); // $NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal - // indicator - badEntities.put("»", "\u00BB"); // $NON-NLS-2$ // right-pointing double - // angle quotation - // mark - badEntities.put("¼", "\u00BC"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one - // quarter - badEntities.put("½", "\u00BD"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one - // half - badEntities.put("¾", "\u00BE"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three - // quarters - badEntities.put("¿", "\u00BF"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted question - // mark - badEntities.put("À", "\u00C0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // A - // with grave - badEntities.put("Á", "\u00C1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // A - // with acute - badEntities.put("Â", "\u00C2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A - // with circumflex - badEntities.put("Ã", "\u00C3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // A - // with tilde - badEntities.put("Ä", "\u00C4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A - // with - // diaeresis - badEntities.put("Å", "\u00C5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A - // with ring above - badEntities.put("Æ", "\u00C6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // AE - badEntities.put("Ç", "\u00C7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // C - // with cedilla - badEntities.put("È", "\u00C8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // E - // with grave - badEntities.put("É", "\u00C9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // E - 
// with acute - badEntities.put("Ê", "\u00CA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E - // with circumflex - badEntities.put("Ë", "\u00CB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E - // with - // diaeresis - badEntities.put("Ì", "\u00CC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // I - // with grave - badEntities.put("Í", "\u00CD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // I - // with acute - badEntities.put("Î", "\u00CE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I - // with circumflex - badEntities.put("Ï", "\u00CF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I - // with - // diaeresis - badEntities.put("Ð", "\u00D0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH - badEntities.put("Ñ", "\u00D1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // N - // with tilde - badEntities.put("Ò", "\u00D2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with grave - badEntities.put("Ó", "\u00D3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with acute - badEntities.put("Ô", "\u00D4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O - // with circumflex - badEntities.put("Õ", "\u00D5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with tilde - badEntities.put("Ö", "\u00D6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O - // with - // diaeresis - badEntities.put("×", "\u00D7"); // $NON-NLS-1$ //$NON-NLS-2$ // multiplication sign - badEntities.put("Ø", "\u00D8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with stroke - badEntities.put("Ù", "\u00D9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // U - // with grave - badEntities.put("Ú", "\u00DA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // U - // with acute - badEntities.put("Û", "\u00DB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U - // with circumflex - badEntities.put("Ü", "\u00DC"); 
// $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U - // with - // diaeresis - badEntities.put("Ý", "\u00DD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // Y - // with acute - badEntities.put("Þ", "\u00DE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // THORN - badEntities.put("ß", "\u00DF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter - // sharp s - badEntities.put("à", "\u00E0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // grave - badEntities.put("á", "\u00E1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // acute - badEntities.put("â", "\u00E2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // circumflex - badEntities.put("ã", "\u00E3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // tilde - badEntities.put("ä", "\u00E4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // diaeresis - badEntities.put("å", "\u00E5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // ring above - badEntities.put("æ", "\u00E6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae - badEntities.put("ç", "\u00E7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter c - // with - // cedilla - badEntities.put("è", "\u00E8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // grave - badEntities.put("é", "\u00E9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // acute - badEntities.put("ê", "\u00EA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // circumflex - badEntities.put("ë", "\u00EB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // diaeresis - badEntities.put("ì", "\u00EC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // grave - badEntities.put("í", "\u00ED"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // acute - badEntities.put("î", "\u00EE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with 
- // circumflex - badEntities.put("ï", "\u00EF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // diaeresis - badEntities.put("ð", "\u00F0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth - badEntities.put("ñ", "\u00F1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter n - // with - // tilde - badEntities.put("ò", "\u00F2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // grave - badEntities.put("ó", "\u00F3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // acute - badEntities.put("ô", "\u00F4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // circumflex - badEntities.put("õ", "\u00F5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // tilde - badEntities.put("ö", "\u00F6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // diaeresis - badEntities.put("÷", "\u00F7"); // $NON-NLS-1$ //$NON-NLS-2$ // division sign - badEntities.put("ø", "\u00F8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // stroke - badEntities.put("ù", "\u00F9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // grave - badEntities.put("ú", "\u00FA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // acute - badEntities.put("û", "\u00FB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // circumflex - badEntities.put("ü", "\u00FC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // diaeresis - badEntities.put("ý", "\u00FD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y - // with - // acute - badEntities.put("þ", "\u00FE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter - // thorn - badEntities.put("ÿ", "\u00FF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y - // with - // diaeresis - } + static { + // pre-defined XML entities + goodEntities.add("""); // $NON-NLS-1$ // quotation mark + goodEntities.add("&"); // $NON-NLS-1$ // ampersand + 
goodEntities.add("<"); // $NON-NLS-1$ // less-than sign + goodEntities.add(">"); // $NON-NLS-1$ // greater-than sign + // control entities + // badEntities.put(" ", ""); + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("€", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‚", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ƒ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("„", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("…", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("†", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‡", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ˆ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‰", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‹", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‘", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("’", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("“", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("”", " "); // $NON-NLS-1$ //$NON-NLS-2$ // 
illegal HTML character + badEntities.put("•", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("–", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("—", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("˜", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("™", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("›", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Ÿ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + // misc entities + badEntities.put("€", "\u20AC"); // $NON-NLS-1$ //$NON-NLS-2$ // euro + badEntities.put("‘", "\u2018"); // $NON-NLS-1$ //$NON-NLS-2$ // left single quotation + // mark + badEntities.put("’", "\u2019"); // $NON-NLS-1$ //$NON-NLS-2$ // right single quotation + // mark + // Latin 1 entities + badEntities.put(" ", "\u00A0"); // $NON-NLS-1$ //$NON-NLS-2$ // no-break space + badEntities.put("¡", "\u00A1"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation + // mark + badEntities.put("¢", "\u00A2"); // $NON-NLS-1$ //$NON-NLS-2$ // cent sign + badEntities.put("£", "\u00A3"); // $NON-NLS-1$ //$NON-NLS-2$ // pound sign + badEntities.put("¤", "\u00A4"); // $NON-NLS-1$ //$NON-NLS-2$ // currency sign + badEntities.put("¥", "\u00A5"); // $NON-NLS-1$ //$NON-NLS-2$ // yen sign + badEntities.put("¦", "\u00A6"); // $NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar + badEntities.put("§", "\u00A7"); // $NON-NLS-1$ //$NON-NLS-2$ // section sign + badEntities.put("¨", "\u00A8"); // $NON-NLS-1$ //$NON-NLS-2$ // diaeresis + 
badEntities.put("©", "\u00A9"); // $NON-NLS-1$ //$NON-NLS-2$ // copyright sign + badEntities.put("ª", "\u00AA"); // $NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal + // indicator + badEntities.put("«", "\u00AB"); // $NON-NLS-2$ // left-pointing double angle + // quotation mark + badEntities.put("¬", "\u00AC"); // $NON-NLS-1$ //$NON-NLS-2$ // not sign + badEntities.put("­", "\u00AD"); // $NON-NLS-1$ //$NON-NLS-2$ // soft hyphen + badEntities.put("®", "\u00AE"); // $NON-NLS-1$ //$NON-NLS-2$ // registered sign + badEntities.put("¯", "\u00AF"); // $NON-NLS-1$ //$NON-NLS-2$ // macron + badEntities.put("°", "\u00B0"); // $NON-NLS-1$ //$NON-NLS-2$ // degree sign + badEntities.put("±", "\u00B1"); // $NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign + badEntities.put("²", "\u00B2"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript two + badEntities.put("³", "\u00B3"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript three + badEntities.put("´", "\u00B4"); // $NON-NLS-1$ //$NON-NLS-2$ // acute accent + badEntities.put("µ", "\u00B5"); // $NON-NLS-1$ //$NON-NLS-2$ // micro sign + badEntities.put("¶", "\u00B6"); // $NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign + badEntities.put("·", "\u00B7"); // $NON-NLS-1$ //$NON-NLS-2$ // middle dot + badEntities.put("¸", "\u00B8"); // $NON-NLS-1$ //$NON-NLS-2$ // cedilla + badEntities.put("¹", "\u00B9"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript one + badEntities.put("º", "\u00BA"); // $NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal + // indicator + badEntities.put("»", "\u00BB"); // $NON-NLS-2$ // right-pointing double + // angle quotation + // mark + badEntities.put("¼", "\u00BC"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one + // quarter + badEntities.put("½", "\u00BD"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one + // half + badEntities.put("¾", "\u00BE"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three + // quarters + badEntities.put("¿", "\u00BF"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted question + // mark + badEntities.put("À", 
"\u00C0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // A + // with grave + badEntities.put("Á", "\u00C1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // A + // with acute + badEntities.put("Â", "\u00C2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A + // with circumflex + badEntities.put("Ã", "\u00C3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // A + // with tilde + badEntities.put("Ä", "\u00C4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A + // with + // diaeresis + badEntities.put("Å", "\u00C5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A + // with ring above + badEntities.put("Æ", "\u00C6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // AE + badEntities.put("Ç", "\u00C7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // C + // with cedilla + badEntities.put("È", "\u00C8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // E + // with grave + badEntities.put("É", "\u00C9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // E + // with acute + badEntities.put("Ê", "\u00CA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E + // with circumflex + badEntities.put("Ë", "\u00CB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E + // with + // diaeresis + badEntities.put("Ì", "\u00CC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // I + // with grave + badEntities.put("Í", "\u00CD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // I + // with acute + badEntities.put("Î", "\u00CE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I + // with circumflex + badEntities.put("Ï", "\u00CF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I + // with + // diaeresis + badEntities.put("Ð", "\u00D0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH + badEntities.put("Ñ", "\u00D1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // N + // with tilde + badEntities.put("Ò", "\u00D2"); // $NON-NLS-1$ //$NON-NLS-2$ 
// latin capital letter + // O + // with grave + badEntities.put("Ó", "\u00D3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // O + // with acute + badEntities.put("Ô", "\u00D4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O + // with circumflex + badEntities.put("Õ", "\u00D5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // O + // with tilde + badEntities.put("Ö", "\u00D6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O + // with + // diaeresis + badEntities.put("×", "\u00D7"); // $NON-NLS-1$ //$NON-NLS-2$ // multiplication sign + badEntities.put("Ø", "\u00D8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // O + // with stroke + badEntities.put("Ù", "\u00D9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // U + // with grave + badEntities.put("Ú", "\u00DA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // U + // with acute + badEntities.put("Û", "\u00DB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U + // with circumflex + badEntities.put("Ü", "\u00DC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U + // with + // diaeresis + badEntities.put("Ý", "\u00DD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // Y + // with acute + badEntities.put("Þ", "\u00DE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // THORN + badEntities.put("ß", "\u00DF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter + // sharp s + badEntities.put("à", "\u00E0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // grave + badEntities.put("á", "\u00E1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // acute + badEntities.put("â", "\u00E2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // circumflex + badEntities.put("ã", "\u00E3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // tilde + badEntities.put("ä", "\u00E4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // diaeresis + 
badEntities.put("å", "\u00E5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // ring above + badEntities.put("æ", "\u00E6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae + badEntities.put("ç", "\u00E7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter c + // with + // cedilla + badEntities.put("è", "\u00E8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e + // with + // grave + badEntities.put("é", "\u00E9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e + // with + // acute + badEntities.put("ê", "\u00EA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e + // with + // circumflex + badEntities.put("ë", "\u00EB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e + // with + // diaeresis + badEntities.put("ì", "\u00EC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i + // with + // grave + badEntities.put("í", "\u00ED"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i + // with + // acute + badEntities.put("î", "\u00EE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i + // with + // circumflex + badEntities.put("ï", "\u00EF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i + // with + // diaeresis + badEntities.put("ð", "\u00F0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth + badEntities.put("ñ", "\u00F1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter n + // with + // tilde + badEntities.put("ò", "\u00F2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // grave + badEntities.put("ó", "\u00F3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // acute + badEntities.put("ô", "\u00F4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // circumflex + badEntities.put("õ", "\u00F5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // tilde + badEntities.put("ö", "\u00F6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // diaeresis + badEntities.put("÷", "\u00F7"); // $NON-NLS-1$ //$NON-NLS-2$ 
// division sign + badEntities.put("ø", "\u00F8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // stroke + badEntities.put("ù", "\u00F9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u + // with + // grave + badEntities.put("ú", "\u00FA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u + // with + // acute + badEntities.put("û", "\u00FB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u + // with + // circumflex + badEntities.put("ü", "\u00FC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u + // with + // diaeresis + badEntities.put("ý", "\u00FD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y + // with + // acute + badEntities.put("þ", "\u00FE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter + // thorn + badEntities.put("ÿ", "\u00FF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y + // with + // diaeresis + } - /** - * For each entity in the input that is not allowed in XML, replace the entity with its unicode - * equivalent or remove it. For each instance of a bare {@literal &}, replace it with {@literal - * &
} XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal - * &lt;} and {@literal &gt;}. - * - * @param broken the string to handle entities - * @return the string with entities appropriately fixed up - */ - public static String cleanAllEntities(final String broken) { - if (broken == null) { - return null; - } + /** + * For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove + * it. For each instance of a bare {@literal &}, replace it with {@literal + * &
+ } XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal + * &lt;} and {@literal &gt;}. + * + * @param broken the string to handle entities + * @return the string with entities appropriately fixed up + */ + public static String cleanAllEntities(final String broken) { + if (broken == null) { + return null; + } - String working = invalidControlCharPattern.matcher(broken).replaceAll(""); - working = invalidCharacterPattern.matcher(working).replaceAll(""); + String working = invalidControlCharPattern.matcher(broken).replaceAll(""); + working = invalidCharacterPattern.matcher(working).replaceAll(""); - int cleanfrom = 0; + int cleanfrom = 0; - while (true) { - int amp = working.indexOf('&', cleanfrom); - // If there are no more amps then we are done - if (amp == -1) { - break; - } - // Skip references of the kind &#ddd; - if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { - cleanfrom = working.indexOf(';', amp) + 1; - continue; - } - int i = amp + 1; - while (true) { - // if we are at the end of the string then just escape the '&'; - if (i >= working.length()) { - return working.substring(0, amp) + "&" + working.substring(amp + 1); // $NON-NLS-1$ - } - // if we have come to a ; then we have an entity - // If it is something that xml can't handle then replace it. 
- final char c = working.charAt(i); - if (c == ';') { - final String entity = working.substring(amp, i + 1); - final String replace = handleEntity(entity); - working = working.substring(0, amp) + replace + working.substring(i + 1); - break; - } - // Did we end an entity without finding a closing ; - // Then treat it as an '&' that needs to be replaced with & - if (!Character.isLetterOrDigit(c)) { - working = working.substring(0, amp) + "&" + working.substring(amp + 1); // $NON-NLS-1$ - amp = i + 4; // account for the 4 extra characters - break; - } - i++; - } - cleanfrom = amp + 1; - } + while (true) { + int amp = working.indexOf('&', cleanfrom); + // If there are no more amps then we are done + if (amp == -1) { + break; + } + // Skip references of the kind &#ddd; + if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { + cleanfrom = working.indexOf(';', amp) + 1; + continue; + } + int i = amp + 1; + while (true) { + // if we are at the end of the string then just escape the '&'; + if (i >= working.length()) { + return working.substring(0, amp) + "&" + working.substring(amp + 1); // $NON-NLS-1$ + } + // if we have come to a ; then we have an entity + // If it is something that xml can't handle then replace it. 
+ final char c = working.charAt(i); + if (c == ';') { + final String entity = working.substring(amp, i + 1); + final String replace = handleEntity(entity); + working = working.substring(0, amp) + replace + working.substring(i + 1); + break; + } + // Did we end an entity without finding a closing ; + // Then treat it as an '&' that needs to be replaced with & + if (!Character.isLetterOrDigit(c)) { + working = working.substring(0, amp) + "&" + working.substring(amp + 1); // $NON-NLS-1$ + amp = i + 4; // account for the 4 extra characters + break; + } + i++; + } + cleanfrom = amp + 1; + } - if (Pattern.compile("<<").matcher(working).find()) { - working = working.replaceAll("<<", "<<"); - } + if (Pattern.compile("<<").matcher(working).find()) { + working = working.replaceAll("<<", "<<"); + } - if (Pattern.compile(">>").matcher(working).find()) { - working = working.replaceAll(">>", ">>"); - } + if (Pattern.compile(">>").matcher(working).find()) { + working = working.replaceAll(">>", ">>"); + } - return working; - } + return working; + } - /** - * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it - * out. XML only allows 4 entities: &amp;, &quot;, &lt; and &gt;. - * - * @param entity the entity to be replaced - * @return the substitution for the entity, either itself, the unicode equivalent or an empty - * string. - */ - private static String handleEntity(final String entity) { - if (goodEntities.contains(entity)) { - return entity; - } + /** + * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only + * allows 4 entities: &amp;, &quot;, &lt; and &gt;. + * + * @param entity the entity to be replaced + * @return the substitution for the entity, either itself, the unicode equivalent or an empty string. 
+ */ + private static String handleEntity(final String entity) { + if (goodEntities.contains(entity)) { + return entity; + } - final String replace = badEntities.get(entity); - if (replace != null) { - return replace; - } + final String replace = badEntities.get(entity); + if (replace != null) { + return replace; + } - return replace != null ? replace : ""; - } + return replace != null ? replace : ""; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java index c568714de7..f4bf78e189 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java @@ -1,71 +1,74 @@ + package eu.dnetlib.dhp.transformation; +import java.io.ByteArrayInputStream; +import java.io.StringWriter; +import java.util.Map; + +import javax.xml.transform.stream.StreamSource; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.util.LongAccumulator; + import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.functions.Cleaner; import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import java.io.ByteArrayInputStream; -import java.io.StringWriter; -import java.util.Map; -import javax.xml.transform.stream.StreamSource; import net.sf.saxon.s9api.*; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.util.LongAccumulator; public class TransformFunction implements MapFunction { - private final LongAccumulator totalItems; - private final LongAccumulator errorItems; - private final LongAccumulator transformedItems; - private final String transformationRule; - private final Cleaner cleanFunction; + private final LongAccumulator totalItems; + private final LongAccumulator errorItems; + private final 
LongAccumulator transformedItems; + private final String transformationRule; + private final Cleaner cleanFunction; - private final long dateOfTransformation; + private final long dateOfTransformation; - public TransformFunction( - LongAccumulator totalItems, - LongAccumulator errorItems, - LongAccumulator transformedItems, - final String transformationRule, - long dateOfTransformation, - final Map vocabularies) - throws Exception { - this.totalItems = totalItems; - this.errorItems = errorItems; - this.transformedItems = transformedItems; - this.transformationRule = transformationRule; - this.dateOfTransformation = dateOfTransformation; - cleanFunction = new Cleaner(vocabularies); - } + public TransformFunction( + LongAccumulator totalItems, + LongAccumulator errorItems, + LongAccumulator transformedItems, + final String transformationRule, + long dateOfTransformation, + final Map vocabularies) + throws Exception { + this.totalItems = totalItems; + this.errorItems = errorItems; + this.transformedItems = transformedItems; + this.transformationRule = transformationRule; + this.dateOfTransformation = dateOfTransformation; + cleanFunction = new Cleaner(vocabularies); + } - @Override - public MetadataRecord call(MetadataRecord value) { - totalItems.add(1); - try { - Processor processor = new Processor(false); - processor.registerExtensionFunction(cleanFunction); - final XsltCompiler comp = processor.newXsltCompiler(); - XsltExecutable xslt = - comp.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes()))); - XdmNode source = - processor - .newDocumentBuilder() - .build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes()))); - XsltTransformer trans = xslt.load(); - trans.setInitialContextNode(source); - final StringWriter output = new StringWriter(); - Serializer out = processor.newSerializer(output); - out.setOutputProperty(Serializer.Property.METHOD, "xml"); - out.setOutputProperty(Serializer.Property.INDENT, "yes"); - 
trans.setDestination(out); - trans.transform(); - final String xml = output.toString(); - value.setBody(xml); - value.setDateOfTransformation(dateOfTransformation); - transformedItems.add(1); - return value; - } catch (Throwable e) { - errorItems.add(1); - return null; - } - } + @Override + public MetadataRecord call(MetadataRecord value) { + totalItems.add(1); + try { + Processor processor = new Processor(false); + processor.registerExtensionFunction(cleanFunction); + final XsltCompiler comp = processor.newXsltCompiler(); + XsltExecutable xslt = comp + .compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes()))); + XdmNode source = processor + .newDocumentBuilder() + .build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes()))); + XsltTransformer trans = xslt.load(); + trans.setInitialContextNode(source); + final StringWriter output = new StringWriter(); + Serializer out = processor.newSerializer(output); + out.setOutputProperty(Serializer.Property.METHOD, "xml"); + out.setOutputProperty(Serializer.Property.INDENT, "yes"); + trans.setDestination(out); + trans.transform(); + final String xml = output.toString(); + value.setBody(xml); + value.setDateOfTransformation(dateOfTransformation); + transformedItems.add(1); + return value; + } catch (Throwable e) { + errorItems.add(1); + return null; + } + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 550136247c..5f39717d05 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -1,17 +1,11 @@ + package eu.dnetlib.dhp.transformation; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import 
eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; import java.io.ByteArrayInputStream; import java.util.HashMap; import java.util.Map; import java.util.Objects; + import org.apache.commons.cli.*; import org.apache.commons.io.IOUtils; import org.apache.spark.sql.Dataset; @@ -24,78 +18,87 @@ import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; +import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; +import eu.dnetlib.dhp.utils.DHPUtils; +import eu.dnetlib.message.Message; +import eu.dnetlib.message.MessageManager; +import eu.dnetlib.message.MessageType; + public class TransformSparkJobNode { - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - TransformSparkJobNode.class.getResourceAsStream( - "/eu/dnetlib/dhp/transformation/transformation_input_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + TransformSparkJobNode.class + .getResourceAsStream( + "/eu/dnetlib/dhp/transformation/transformation_input_parameters.json"))); - parser.parseArgument(args); + parser.parseArgument(args); - final String inputPath = parser.get("input"); - final String outputPath = parser.get("output"); - final String workflowId = parser.get("workflowId"); - final String trasformationRule = - extractXSLTFromTR( - 
Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule")))); - final String master = parser.get("master"); - final String rabbitUser = parser.get("rabbitUser"); - final String rabbitPassword = parser.get("rabbitPassword"); - final String rabbitHost = parser.get("rabbitHost"); - final String rabbitReportQueue = parser.get("rabbitReportQueue"); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); - final boolean test = - parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); + final String inputPath = parser.get("input"); + final String outputPath = parser.get("output"); + final String workflowId = parser.get("workflowId"); + final String trasformationRule = extractXSLTFromTR( + Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule")))); + final String master = parser.get("master"); + final String rabbitUser = parser.get("rabbitUser"); + final String rabbitPassword = parser.get("rabbitPassword"); + final String rabbitHost = parser.get("rabbitHost"); + final String rabbitReportQueue = parser.get("rabbitReportQueue"); + final long dateOfCollection = new Long(parser.get("dateOfCollection")); + final boolean test = parser.get("isTest") == null ? 
false : Boolean.valueOf(parser.get("isTest")); - final SparkSession spark = - SparkSession.builder().appName("TransformStoreSparkJob").master(master).getOrCreate(); + final SparkSession spark = SparkSession + .builder() + .appName("TransformStoreSparkJob") + .master(master) + .getOrCreate(); - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstoreInput = - spark.read().format("parquet").load(inputPath).as(encoder); - final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); - final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); - final LongAccumulator transformedItems = - spark.sparkContext().longAccumulator("transformedItems"); - final Map vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - final TransformFunction transformFunction = - new TransformFunction( - totalItems, - errorItems, - transformedItems, - trasformationRule, - dateOfCollection, - vocabularies); - mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath); - if (rabbitHost != null) { - System.out.println("SEND FINAL REPORT"); - final Map reportMap = new HashMap<>(); - reportMap.put("inputItem", "" + totalItems.value()); - reportMap.put("invalidRecords", "" + errorItems.value()); - reportMap.put("mdStoreSize", "" + transformedItems.value()); - System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap)); - if (!test) { - final MessageManager manager = - new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false, false, null); - manager.sendMessage( - new Message(workflowId, "Transform", MessageType.REPORT, reportMap), - rabbitReportQueue, - true, - false); - manager.close(); - } - } - } + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); + final LongAccumulator 
totalItems = spark.sparkContext().longAccumulator("TotalItems"); + final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); + final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems"); + final Map vocabularies = new HashMap<>(); + vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); + final TransformFunction transformFunction = new TransformFunction( + totalItems, + errorItems, + transformedItems, + trasformationRule, + dateOfCollection, + vocabularies); + mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath); + if (rabbitHost != null) { + System.out.println("SEND FINAL REPORT"); + final Map reportMap = new HashMap<>(); + reportMap.put("inputItem", "" + totalItems.value()); + reportMap.put("invalidRecords", "" + errorItems.value()); + reportMap.put("mdStoreSize", "" + transformedItems.value()); + System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap)); + if (!test) { + final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false, false, + null); + manager + .sendMessage( + new Message(workflowId, "Transform", MessageType.REPORT, reportMap), + rabbitReportQueue, + true, + false); + manager.close(); + } + } + } - private static String extractXSLTFromTR(final String tr) throws DocumentException { - SAXReader reader = new SAXReader(); - Document document = reader.read(new ByteArrayInputStream(tr.getBytes())); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); - return node.asXML(); - } + private static String extractXSLTFromTR(final String tr) throws DocumentException { + SAXReader reader = new SAXReader(); + Document document = reader.read(new ByteArrayInputStream(tr.getBytes())); + Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); + return node.asXML(); + } } diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java index 09e77613c0..7f9b6646c3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java @@ -1,48 +1,52 @@ + package eu.dnetlib.dhp.transformation.functions; +import java.util.Map; +import java.util.Optional; + import eu.dnetlib.dhp.transformation.vocabulary.Term; import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import java.util.Map; -import java.util.Optional; import net.sf.saxon.s9api.*; import scala.Serializable; public class Cleaner implements ExtensionFunction, Serializable { - private final Map vocabularies; + private final Map vocabularies; - public Cleaner(Map vocabularies) { - this.vocabularies = vocabularies; - } + public Cleaner(Map vocabularies) { + this.vocabularies = vocabularies; + } - @Override - public QName getName() { - return new QName("http://eu/dnetlib/trasform/extension", "clean"); - } + @Override + public QName getName() { + return new QName("http://eu/dnetlib/trasform/extension", "clean"); + } - @Override - public SequenceType getResultType() { - return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE_OR_MORE); - } + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE_OR_MORE); + } - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] { - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE), - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) - }; - } + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE), + 
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) + }; + } - @Override - public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { - final String currentValue = xdmValues[0].itemAt(0).getStringValue(); - final String vocabularyName = xdmValues[1].itemAt(0).getStringValue(); - Optional cleanedValue = - vocabularies.get(vocabularyName).getTerms().stream() - .filter(it -> it.getNativeName().equalsIgnoreCase(currentValue)) - .findAny(); + @Override + public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + final String currentValue = xdmValues[0].itemAt(0).getStringValue(); + final String vocabularyName = xdmValues[1].itemAt(0).getStringValue(); + Optional cleanedValue = vocabularies + .get(vocabularyName) + .getTerms() + .stream() + .filter(it -> it.getNativeName().equalsIgnoreCase(currentValue)) + .findAny(); - return new XdmAtomicValue( - cleanedValue.isPresent() ? cleanedValue.get().getCode() : currentValue); - } + return new XdmAtomicValue( + cleanedValue.isPresent() ? 
cleanedValue.get().getCode() : currentValue); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java index 813a779416..b5ac18169e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java @@ -1,52 +1,53 @@ + package eu.dnetlib.dhp.transformation.vocabulary; import java.io.Serializable; public class Term implements Serializable { - private String englishName; - private String nativeName; - private String encoding; - private String code; - private String synonyms; + private String englishName; + private String nativeName; + private String encoding; + private String code; + private String synonyms; - public String getEnglishName() { - return englishName; - } + public String getEnglishName() { + return englishName; + } - public void setEnglishName(String englishName) { - this.englishName = englishName; - } + public void setEnglishName(String englishName) { + this.englishName = englishName; + } - public String getNativeName() { - return nativeName; - } + public String getNativeName() { + return nativeName; + } - public void setNativeName(String nativeName) { - this.nativeName = nativeName; - } + public void setNativeName(String nativeName) { + this.nativeName = nativeName; + } - public String getEncoding() { - return encoding; - } + public String getEncoding() { + return encoding; + } - public void setEncoding(String encoding) { - this.encoding = encoding; - } + public void setEncoding(String encoding) { + this.encoding = encoding; + } - public String getCode() { - return code; - } + public String getCode() { + return code; + } - public void setCode(String code) { - this.code = code; - } + public void setCode(String code) { + this.code = code; + } - public String 
getSynonyms() { - return synonyms; - } + public String getSynonyms() { + return synonyms; + } - public void setSynonyms(String synonyms) { - this.synonyms = synonyms; - } + public void setSynonyms(String synonyms) { + this.synonyms = synonyms; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java index 0579c82440..a9da6b7256 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.transformation.vocabulary; import java.io.Serializable; @@ -5,49 +6,49 @@ import java.util.List; public class Vocabulary implements Serializable { - private String id; - private String name; - private String description; - private String code; - private List terms; + private String id; + private String name; + private String description; + private String code; + private List terms; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getDescription() { - return description; - } + public String getDescription() { + return description; + } - public void setDescription(String description) { - this.description = description; - } + public void setDescription(String description) { + this.description = description; + } - public String getCode() { - return code; - } + public String getCode() { + return code; + } - public void setCode(String code) { - this.code = 
code; - } + public void setCode(String code) { + this.code = code; + } - public List getTerms() { - return terms; - } + public List getTerms() { + return terms; + } - public void setTerms(List terms) { - this.terms = terms; - } + public void setTerms(List terms) { + this.terms = terms; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java index 349fc53de4..10e959be05 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java @@ -1,21 +1,24 @@ + package eu.dnetlib.dhp.transformation.vocabulary; -import com.fasterxml.jackson.databind.ObjectMapper; import java.io.Serializable; import java.net.URL; import java.nio.charset.Charset; + import org.apache.commons.io.IOUtils; +import com.fasterxml.jackson.databind.ObjectMapper; + public class VocabularyHelper implements Serializable { - private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json"; + private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json"; - public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception { - final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName)); + public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception { + final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName)); - final String response = IOUtils.toString(url, Charset.defaultCharset()); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class); - return vocabulary; - } + final String response = IOUtils.toString(url, Charset.defaultCharset()); + final ObjectMapper 
jsonMapper = new ObjectMapper(); + final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class); + return vocabulary; + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java index cbf0cfd01d..44364b30a5 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java @@ -1,116 +1,121 @@ + package eu.dnetlib.dhp.collection; import static org.junit.jupiter.api.Assertions.*; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; + public class CollectionJobTest { - private Path testDir; + private Path testDir; - @BeforeEach - public void setup() throws IOException { - testDir = Files.createTempDirectory("dhp-collection"); - } + @BeforeEach + public void setup() throws IOException { + testDir = Files.createTempDirectory("dhp-collection"); + } - @AfterEach - public void teadDown() throws IOException { - FileUtils.deleteDirectory(testDir.toFile()); - } + @AfterEach + public void teadDown() throws IOException { + FileUtils.deleteDirectory(testDir.toFile()); + } - @Test - public void tesCollection() throws Exception { - final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); - GenerateNativeStoreSparkJob.main( - new 
String[] { - "-mt", - "local", - "-w", - "wid", - "-e", - "XML", - "-d", - "" + System.currentTimeMillis(), - "-p", - new ObjectMapper().writeValueAsString(provenance), - "-x", - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "-i", - this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), - "-o", - testDir.toString() + "/store", - "-t", - "true", - "-ru", - "", - "-rp", - "", - "-rh", - "", - "-ro", - "", - "-rr", - "" - }); - System.out.println(new ObjectMapper().writeValueAsString(provenance)); - } + @Test + public void tesCollection() throws Exception { + final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); + GenerateNativeStoreSparkJob + .main( + new String[] { + "-mt", + "local", + "-w", + "wid", + "-e", + "XML", + "-d", + "" + System.currentTimeMillis(), + "-p", + new ObjectMapper().writeValueAsString(provenance), + "-x", + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "-i", + this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), + "-o", + testDir.toString() + "/store", + "-t", + "true", + "-ru", + "", + "-rp", + "", + "-rh", + "", + "-ro", + "", + "-rr", + "" + }); + System.out.println(new ObjectMapper().writeValueAsString(provenance)); + } - @Test - public void testGenerationMetadataRecord() throws Exception { + @Test + public void testGenerationMetadataRecord() throws Exception { - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final MetadataRecord record = - GenerateNativeStoreSparkJob.parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord( + 
xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); - assert record != null; - System.out.println(record.getId()); - System.out.println(record.getOriginalId()); - } + assert record != null; + System.out.println(record.getId()); + System.out.println(record.getOriginalId()); + } - @Test - public void TestEquals() throws IOException { + @Test + public void TestEquals() throws IOException { - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final MetadataRecord record = - GenerateNativeStoreSparkJob.parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); - final MetadataRecord record1 = - GenerateNativeStoreSparkJob.parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); - assert record != null; - record.setBody("ciao"); - assert record1 != null; - record1.setBody("mondo"); - assertEquals(record, record1); - } + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + final MetadataRecord record1 = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + assert record != null; + record.setBody("ciao"); + assert 
record1 != null; + record1.setBody("mondo"); + assertEquals(record, record1); + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java index a524d75e7f..1a4fafb66c 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -1,88 +1,92 @@ + package eu.dnetlib.dhp.collector.worker; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.mockito.Mockito.*; +import java.io.File; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.message.Message; import eu.dnetlib.message.MessageManager; -import java.io.File; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; public class DnetCollectorWorkerApplicationTests { - private ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class); - private MessageManager messageManager = mock(MessageManager.class); + private ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class); + private MessageManager messageManager = mock(MessageManager.class); - private DnetCollectorWorker worker; + private DnetCollectorWorker worker; - @BeforeEach - public void setup() throws Exception { - ObjectMapper mapper = new ObjectMapper(); - final 
String apiJson = mapper.writeValueAsString(getApi()); - when(argumentParser.get("apidescriptor")).thenReturn(apiJson); - when(argumentParser.get("namenode")).thenReturn("file://tmp/test.seq"); - when(argumentParser.get("hdfsPath")).thenReturn("/tmp/file.seq"); - when(argumentParser.get("userHDFS")).thenReturn("sandro"); - when(argumentParser.get("workflowId")).thenReturn("sandro"); - when(argumentParser.get("rabbitOngoingQueue")).thenReturn("sandro"); + @BeforeEach + public void setup() throws Exception { + ObjectMapper mapper = new ObjectMapper(); + final String apiJson = mapper.writeValueAsString(getApi()); + when(argumentParser.get("apidescriptor")).thenReturn(apiJson); + when(argumentParser.get("namenode")).thenReturn("file://tmp/test.seq"); + when(argumentParser.get("hdfsPath")).thenReturn("/tmp/file.seq"); + when(argumentParser.get("userHDFS")).thenReturn("sandro"); + when(argumentParser.get("workflowId")).thenReturn("sandro"); + when(argumentParser.get("rabbitOngoingQueue")).thenReturn("sandro"); - when(messageManager.sendMessage(any(Message.class), anyString(), anyBoolean(), anyBoolean())) - .thenAnswer( - a -> { - System.out.println("sent message: " + a.getArguments()[0]); - return true; - }); - when(messageManager.sendMessage(any(Message.class), anyString())) - .thenAnswer( - a -> { - System.out.println("Called"); - return true; - }); - worker = new DnetCollectorWorker(new CollectorPluginFactory(), argumentParser, messageManager); - } + when(messageManager.sendMessage(any(Message.class), anyString(), anyBoolean(), anyBoolean())) + .thenAnswer( + a -> { + System.out.println("sent message: " + a.getArguments()[0]); + return true; + }); + when(messageManager.sendMessage(any(Message.class), anyString())) + .thenAnswer( + a -> { + System.out.println("Called"); + return true; + }); + worker = new DnetCollectorWorker(new CollectorPluginFactory(), argumentParser, messageManager); + } - @AfterEach - public void dropDown() { - File f = new File("/tmp/file.seq"); - 
f.delete(); - } + @AfterEach + public void dropDown() { + File f = new File("/tmp/file.seq"); + f.delete(); + } - @Test - public void testFindPlugin() throws Exception { - final CollectorPluginFactory collectorPluginEnumerator = new CollectorPluginFactory(); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai")); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); - } + @Test + public void testFindPlugin() throws Exception { + final CollectorPluginFactory collectorPluginEnumerator = new CollectorPluginFactory(); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai")); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); + } - @Test - public void testCollectionOAI() throws Exception { - final ApiDescriptor api = new ApiDescriptor(); - api.setId("oai"); - api.setProtocol("oai"); - api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); - api.getParams().put("format", "oai_dc"); - ObjectMapper mapper = new ObjectMapper(); - assertNotNull(mapper.writeValueAsString(api)); - } + @Test + public void testCollectionOAI() throws Exception { + final ApiDescriptor api = new ApiDescriptor(); + api.setId("oai"); + api.setProtocol("oai"); + api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); + api.getParams().put("format", "oai_dc"); + ObjectMapper mapper = new ObjectMapper(); + assertNotNull(mapper.writeValueAsString(api)); + } - @Test - public void testFeeding() throws Exception { - worker.collect(); - } + @Test + public void testFeeding() throws Exception { + worker.collect(); + } - private ApiDescriptor getApi() { - final ApiDescriptor api = new ApiDescriptor(); - api.setId("oai"); - api.setProtocol("oai"); - api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); - api.getParams().put("format", "oai_dc"); - return api; - } + private ApiDescriptor getApi() { + final ApiDescriptor api = new ApiDescriptor(); + 
api.setId("oai"); + api.setProtocol("oai"); + api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); + api.getParams().put("format", "oai_dc"); + return api; + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 12a89053e4..01c9e31030 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -1,19 +1,16 @@ + package eu.dnetlib.dhp.transformation; import static org.junit.jupiter.api.Assertions.assertNotNull; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.functions.Cleaner; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; -import eu.dnetlib.dhp.utils.DHPUtils; import java.io.StringWriter; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashMap; import java.util.Map; + import javax.xml.transform.stream.StreamSource; -import net.sf.saxon.s9api.*; + import org.apache.commons.io.IOUtils; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; @@ -26,127 +23,133 @@ import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.functions.Cleaner; +import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; +import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; +import eu.dnetlib.dhp.utils.DHPUtils; +import net.sf.saxon.s9api.*; + @ExtendWith(MockitoExtension.class) public class TransformationJobTest { - @Mock private LongAccumulator accumulator; + @Mock + private LongAccumulator accumulator; - 
@Test - public void testTransformSaxonHE() throws Exception { + @Test + public void testTransformSaxonHE() throws Exception { - Map vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - Cleaner cleanFunction = new Cleaner(vocabularies); - Processor proc = new Processor(false); - proc.registerExtensionFunction(cleanFunction); - final XsltCompiler comp = proc.newXsltCompiler(); - XsltExecutable exp = - comp.compile( - new StreamSource( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/ext_simple.xsl"))); - XdmNode source = - proc.newDocumentBuilder() - .build( - new StreamSource( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); - XsltTransformer trans = exp.load(); - trans.setInitialContextNode(source); - final StringWriter output = new StringWriter(); - Serializer out = proc.newSerializer(output); - out.setOutputProperty(Serializer.Property.METHOD, "xml"); - out.setOutputProperty(Serializer.Property.INDENT, "yes"); - trans.setDestination(out); - trans.transform(); - System.out.println(output.toString()); - } + Map vocabularies = new HashMap<>(); + vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); + Cleaner cleanFunction = new Cleaner(vocabularies); + Processor proc = new Processor(false); + proc.registerExtensionFunction(cleanFunction); + final XsltCompiler comp = proc.newXsltCompiler(); + XsltExecutable exp = comp + .compile( + new StreamSource( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/ext_simple.xsl"))); + XdmNode source = proc + .newDocumentBuilder() + .build( + new StreamSource( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); + XsltTransformer trans = exp.load(); + trans.setInitialContextNode(source); + final StringWriter output = new StringWriter(); + Serializer out = proc.newSerializer(output); + 
out.setOutputProperty(Serializer.Property.METHOD, "xml"); + out.setOutputProperty(Serializer.Property.INDENT, "yes"); + trans.setDestination(out); + trans.transform(); + System.out.println(output.toString()); + } - @DisplayName("Test TransformSparkJobNode.main") - @Test - public void transformTest(@TempDir Path testDir) throws Exception { - final String mdstore_input = - this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); - final String mdstore_output = testDir.toString() + "/version"; - final String xslt = - DHPUtils.compressString( - IOUtils.toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"))); - TransformSparkJobNode.main( - new String[] { - "-mt", - "local", - "-i", - mdstore_input, - "-o", - mdstore_output, - "-d", - "1", - "-w", - "1", - "-tr", - xslt, - "-t", - "true", - "-ru", - "", - "-rp", - "", - "-rh", - "", - "-ro", - "", - "-rr", - "" - }); - } + @DisplayName("Test TransformSparkJobNode.main") + @Test + public void transformTest(@TempDir Path testDir) throws Exception { + final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); + final String mdstore_output = testDir.toString() + "/version"; + final String xslt = DHPUtils + .compressString( + IOUtils + .toString( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"))); + TransformSparkJobNode + .main( + new String[] { + "-mt", + "local", + "-i", + mdstore_input, + "-o", + mdstore_output, + "-d", + "1", + "-w", + "1", + "-tr", + xslt, + "-t", + "true", + "-ru", + "", + "-rp", + "", + "-rh", + "", + "-ro", + "", + "-rr", + "" + }); + } - @Test - public void tryLoadFolderOnCP() throws Exception { - final String path = - this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); - System.out.println("path = " + path); + @Test + public void tryLoadFolderOnCP() throws Exception { + final String path = 
this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); + System.out.println("path = " + path); - Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output"); + Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output"); - System.out.println(tempDirWithPrefix.toFile().getAbsolutePath()); + System.out.println(tempDirWithPrefix.toFile().getAbsolutePath()); - Files.deleteIfExists(tempDirWithPrefix); - } + Files.deleteIfExists(tempDirWithPrefix); + } - @Test - public void testTransformFunction() throws Exception { - SAXReader reader = new SAXReader(); - Document document = - reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); - final String xslt = node.asXML(); - Map vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); + @Test + public void testTransformFunction() throws Exception { + SAXReader reader = new SAXReader(); + Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); + final String xslt = node.asXML(); + Map vocabularies = new HashMap<>(); + vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - TransformFunction tf = - new TransformFunction(accumulator, accumulator, accumulator, xslt, 1, vocabularies); + TransformFunction tf = new TransformFunction(accumulator, accumulator, accumulator, xslt, 1, vocabularies); - MetadataRecord record = new MetadataRecord(); - record.setBody( - IOUtils.toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); + MetadataRecord record = new MetadataRecord(); + record + .setBody( + IOUtils + .toString( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); - final 
MetadataRecord result = tf.call(record); - assertNotNull(result.getBody()); + final MetadataRecord result = tf.call(record); + assertNotNull(result.getBody()); - System.out.println(result.getBody()); - } + System.out.println(result.getBody()); + } - @Test - public void extractTr() throws Exception { + @Test + public void extractTr() throws Exception { - final String xmlTr = - IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + final String xmlTr = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); - SAXReader reader = new SAXReader(); - Document document = - reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); + SAXReader reader = new SAXReader(); + Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); - System.out.println(node.asXML()); - } + System.out.println(node.asXML()); + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java index 3732c5e82c..1ae942a6b8 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.transformation.vocabulary; import static org.junit.jupiter.api.Assertions.*; @@ -6,10 +7,10 @@ import org.junit.jupiter.api.Test; public class VocabularyTest { - @Test - public void testLoadVocabulary() throws Exception { + @Test + public void testLoadVocabulary() throws Exception { - final Vocabulary vocabulary = 
VocabularyHelper.getVocabularyFromAPI("dnet:languages"); - assertEquals("dnet:languages", vocabulary.getName()); - } + final Vocabulary vocabulary = VocabularyHelper.getVocabularyFromAPI("dnet:languages"); + assertEquals("dnet:languages", vocabulary.getName()); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java index 0c4a77be9d..2120da0808 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java @@ -1,18 +1,12 @@ + package eu.dnetlib.dhp.oa.dedup; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; import java.io.IOException; import java.io.Serializable; import java.io.StringReader; import java.util.ArrayList; import java.util.List; + import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SaveMode; @@ -22,72 +16,82 @@ import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; + abstract class 
AbstractSparkAction implements Serializable { - protected static final ObjectMapper OBJECT_MAPPER = - new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - public ArgumentApplicationParser parser; // parameters for the spark action - public SparkSession spark; // the spark session + public ArgumentApplicationParser parser; // parameters for the spark action + public SparkSession spark; // the spark session - public AbstractSparkAction(ArgumentApplicationParser parser, SparkSession spark) { + public AbstractSparkAction(ArgumentApplicationParser parser, SparkSession spark) { - this.parser = parser; - this.spark = spark; - } + this.parser = parser; + this.spark = spark; + } - public List getConfigurations(ISLookUpService isLookUpService, String orchestrator) - throws ISLookUpException, DocumentException, IOException { + public List getConfigurations(ISLookUpService isLookUpService, String orchestrator) + throws ISLookUpException, DocumentException, IOException { - final String xquery = - String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); - String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); + String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); - final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); + final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); - final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); + final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); - final List configurations = new ArrayList<>(); + final List configurations = new ArrayList<>(); - for (final Object o : 
doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { - configurations.add(loadConfig(isLookUpService, actionSetId, o)); - } + for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { + configurations.add(loadConfig(isLookUpService, actionSetId, o)); + } - return configurations; - } + return configurations; + } - private DedupConfig loadConfig( - final ISLookUpService isLookUpService, final String actionSetId, final Object o) - throws ISLookUpException, IOException { - final Element s = (Element) o; - final String configProfileId = s.attributeValue("id"); - final String conf = - isLookUpService.getResourceProfileByQuery( - String.format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - configProfileId)); + private DedupConfig loadConfig( + final ISLookUpService isLookUpService, final String actionSetId, final Object o) + throws ISLookUpException, IOException { + final Element s = (Element) o; + final String configProfileId = s.attributeValue("id"); + final String conf = isLookUpService + .getResourceProfileByQuery( + String + .format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); - DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); - dedupConfig.getPace().initModel(); - dedupConfig.getPace().initTranslationMap(); - dedupConfig.getWf().setConfigurationId(actionSetId); + DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); + dedupConfig.getPace().initModel(); + dedupConfig.getPace().initTranslationMap(); + dedupConfig.getWf().setConfigurationId(actionSetId); - return dedupConfig; - } + return dedupConfig; + } - abstract void run(ISLookUpService isLookUpService) - throws DocumentException, IOException, ISLookUpException; + abstract void run(ISLookUpService isLookUpService) + throws DocumentException, IOException, ISLookUpException; - protected static SparkSession 
getSparkSession(SparkConf conf) { - return SparkSession.builder().config(conf).getOrCreate(); - } + protected static SparkSession getSparkSession(SparkConf conf) { + return SparkSession.builder().config(conf).getOrCreate(); + } - protected static void save(Dataset dataset, String outPath, SaveMode mode) { - dataset.write().option("compression", "gzip").mode(mode).json(outPath); - } + protected static void save(Dataset dataset, String outPath, SaveMode mode) { + dataset.write().option("compression", "gzip").mode(mode).json(outPath); + } - protected static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + protected static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java index 098d024f4b..70fb2cc5b6 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java @@ -1,113 +1,121 @@ + package eu.dnetlib.dhp.oa.dedup; import static java.util.Collections.reverseOrder; import static java.util.Map.Entry.comparingByValue; import static java.util.stream.Collectors.toMap; + import static org.apache.commons.lang.StringUtils.endsWith; import static org.apache.commons.lang.StringUtils.substringBefore; -import eu.dnetlib.dhp.schema.oaf.Field; import java.time.Year; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.lang.StringUtils; +import eu.dnetlib.dhp.schema.oaf.Field; + public class DatePicker { - private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; - private static final String DATE_DEFAULT_SUFFIX = "01-01"; - private static final int YEAR_LB = 1300; 
- private static final int YEAR_UB = Year.now().getValue() + 5; + private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; + private static final String DATE_DEFAULT_SUFFIX = "01-01"; + private static final int YEAR_LB = 1300; + private static final int YEAR_UB = Year.now().getValue() + 5; - public static Field pick(final Collection dateofacceptance) { + public static Field pick(final Collection dateofacceptance) { - final Map frequencies = - dateofacceptance - .parallelStream() - .filter(StringUtils::isNotBlank) - .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); + final Map frequencies = dateofacceptance + .parallelStream() + .filter(StringUtils::isNotBlank) + .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); - if (frequencies.isEmpty()) { - return new Field<>(); - } + if (frequencies.isEmpty()) { + return new Field<>(); + } - final Field date = new Field<>(); - date.setValue(frequencies.keySet().iterator().next()); + final Field date = new Field<>(); + date.setValue(frequencies.keySet().iterator().next()); - // let's sort this map by values first, filtering out invalid dates - final Map sorted = - frequencies.entrySet().stream() - .filter(d -> StringUtils.isNotBlank(d.getKey())) - .filter(d -> d.getKey().matches(DATE_PATTERN)) - .filter(d -> inRange(d.getKey())) - .sorted(reverseOrder(comparingByValue())) - .collect( - toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); + // let's sort this map by values first, filtering out invalid dates + final Map sorted = frequencies + .entrySet() + .stream() + .filter(d -> StringUtils.isNotBlank(d.getKey())) + .filter(d -> d.getKey().matches(DATE_PATTERN)) + .filter(d -> inRange(d.getKey())) + .sorted(reverseOrder(comparingByValue())) + .collect( + toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); - // shortcut - if (sorted.size() == 0) { - return date; - } + // shortcut + if (sorted.size() == 0) { + return date; 
+ } - // voting method (1/3 + 1) wins - if (sorted.size() >= 3) { - final int acceptThreshold = (sorted.size() / 3) + 1; - final List accepted = - sorted.entrySet().stream() - .filter(e -> e.getValue() >= acceptThreshold) - .map(e -> e.getKey()) - .collect(Collectors.toList()); + // voting method (1/3 + 1) wins + if (sorted.size() >= 3) { + final int acceptThreshold = (sorted.size() / 3) + 1; + final List accepted = sorted + .entrySet() + .stream() + .filter(e -> e.getValue() >= acceptThreshold) + .map(e -> e.getKey()) + .collect(Collectors.toList()); - // cannot find strong majority - if (accepted.isEmpty()) { - final int max = sorted.values().iterator().next(); - Optional first = - sorted.entrySet().stream() - .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) - .map(Map.Entry::getKey) - .findFirst(); - if (first.isPresent()) { - date.setValue(first.get()); - return date; - } + // cannot find strong majority + if (accepted.isEmpty()) { + final int max = sorted.values().iterator().next(); + Optional first = sorted + .entrySet() + .stream() + .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) + .map(Map.Entry::getKey) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } - date.setValue(sorted.keySet().iterator().next()); - return date; - } + date.setValue(sorted.keySet().iterator().next()); + return date; + } - if (accepted.size() == 1) { - date.setValue(accepted.get(0)); - return date; - } else { - final Optional first = - accepted.stream().filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)).findFirst(); - if (first.isPresent()) { - date.setValue(first.get()); - return date; - } + if (accepted.size() == 1) { + date.setValue(accepted.get(0)); + return date; + } else { + final Optional first = accepted + .stream() + .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } - return date; - } + 
return date; + } - // 1st non YYYY-01-01 is returned - } else { - if (sorted.size() == 2) { - for (Map.Entry e : sorted.entrySet()) { - if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { - date.setValue(e.getKey()); - return date; - } - } - } + // 1st non YYYY-01-01 is returned + } else { + if (sorted.size() == 2) { + for (Map.Entry e : sorted.entrySet()) { + if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { + date.setValue(e.getKey()); + return date; + } + } + } - // none of the dates seems good enough, return the 1st one - date.setValue(sorted.keySet().iterator().next()); - return date; - } - } + // none of the dates seems good enough, return the 1st one + date.setValue(sorted.keySet().iterator().next()); + return date; + } + } - private static boolean inRange(final String date) { - final int year = Integer.parseInt(substringBefore(date, "-")); - return year >= YEAR_LB && year <= YEAR_UB; - } + private static boolean inRange(final String date) { + final int year = Integer.parseInt(substringBefore(date, "-")); + return year >= YEAR_LB && year <= YEAR_UB; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index 5a806c0a0d..fa06424d7a 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -1,12 +1,9 @@ + package eu.dnetlib.dhp.oa.dedup; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Collection; import java.util.Iterator; + import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; import 
org.apache.spark.sql.Dataset; @@ -14,92 +11,96 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; + +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; public class DedupRecordFactory { - private static final Logger log = LoggerFactory.getLogger(DedupRecordFactory.class); + private static final Logger log = LoggerFactory.getLogger(DedupRecordFactory.class); - protected static final ObjectMapper OBJECT_MAPPER = - new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - public static Dataset createDedupRecord( - final SparkSession spark, - final DataInfo dataInfo, - final String mergeRelsInputPath, - final String entitiesInputPath, - final Class clazz) { + public static Dataset createDedupRecord( + final SparkSession spark, + final DataInfo dataInfo, + final String mergeRelsInputPath, + final String entitiesInputPath, + final Class clazz) { - long ts = System.currentTimeMillis(); + long ts = System.currentTimeMillis(); - // - Dataset> entities = - spark - .read() - .textFile(entitiesInputPath) - .map( - (MapFunction>) - it -> { - T entity = OBJECT_MAPPER.readValue(it, clazz); - return new Tuple2<>(entity.getId(), entity); - }, - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + // + Dataset> entities = spark + .read() + .textFile(entitiesInputPath) + .map( + (MapFunction>) it -> { + T entity = OBJECT_MAPPER.readValue(it, clazz); + return new Tuple2<>(entity.getId(), entity); + }, + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - // : source is the dedup_id, target is the 
id of the mergedIn - Dataset> mergeRels = - spark - .read() - .load(mergeRelsInputPath) - .as(Encoders.bean(Relation.class)) - .where("relClass == 'merges'") - .map( - (MapFunction>) - r -> new Tuple2<>(r.getSource(), r.getTarget()), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + // : source is the dedup_id, target is the id of the mergedIn + Dataset> mergeRels = spark + .read() + .load(mergeRelsInputPath) + .as(Encoders.bean(Relation.class)) + .where("relClass == 'merges'") + .map( + (MapFunction>) r -> new Tuple2<>(r.getSource(), r.getTarget()), + Encoders.tuple(Encoders.STRING(), Encoders.STRING())); - return mergeRels - .joinWith(entities, mergeRels.col("_2").equalTo(entities.col("_1")), "inner") - .map( - (MapFunction, Tuple2>, Tuple2>) - value -> new Tuple2<>(value._1()._1(), value._2()._2()), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))) - .groupByKey( - (MapFunction, String>) entity -> entity._1(), Encoders.STRING()) - .mapGroups( - (MapGroupsFunction, T>) - (key, values) -> entityMerger(key, values, ts, dataInfo), - Encoders.bean(clazz)); - } + return mergeRels + .joinWith(entities, mergeRels.col("_2").equalTo(entities.col("_1")), "inner") + .map( + (MapFunction, Tuple2>, Tuple2>) value -> new Tuple2<>( + value._1()._1(), value._2()._2()), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))) + .groupByKey( + (MapFunction, String>) entity -> entity._1(), Encoders.STRING()) + .mapGroups( + (MapGroupsFunction, T>) (key, + values) -> entityMerger(key, values, ts, dataInfo), + Encoders.bean(clazz)); + } - private static T entityMerger( - String id, Iterator> entities, long ts, DataInfo dataInfo) { + private static T entityMerger( + String id, Iterator> entities, long ts, DataInfo dataInfo) { - T entity = entities.next()._2(); + T entity = entities.next()._2(); - final Collection dates = Lists.newArrayList(); - entities.forEachRemaining( - t -> { - T duplicate = t._2(); - entity.mergeFrom(duplicate); - if 
(ModelSupport.isSubClass(duplicate, Result.class)) { - Result r1 = (Result) duplicate; - Result er = (Result) entity; - er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor())); + final Collection dates = Lists.newArrayList(); + entities + .forEachRemaining( + t -> { + T duplicate = t._2(); + entity.mergeFrom(duplicate); + if (ModelSupport.isSubClass(duplicate, Result.class)) { + Result r1 = (Result) duplicate; + Result er = (Result) entity; + er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor())); - if (r1.getDateofacceptance() != null) { - dates.add(r1.getDateofacceptance().getValue()); - } - } - }); + if (r1.getDateofacceptance() != null) { + dates.add(r1.getDateofacceptance().getValue()); + } + } + }); - if (ModelSupport.isSubClass(entity, Result.class)) { - ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); - } + if (ModelSupport.isSubClass(entity, Result.class)) { + ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); + } - entity.setId(id); - entity.setLastupdatetimestamp(ts); - entity.setDataInfo(dataInfo); + entity.setId(id); + entity.setLastupdatetimestamp(ts); + entity.setDataInfo(dataInfo); - return entity; - } + return entity; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index f239e072f8..4f797f7f77 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -1,7 +1,24 @@ + package eu.dnetlib.dhp.oa.dedup; +import java.io.StringReader; +import java.security.MessageDigest; +import java.text.Normalizer; +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkContext; +import 
org.apache.spark.util.LongAccumulator; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; + import com.google.common.collect.Sets; import com.wcohen.ss.JaroWinkler; + import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.ISLookupClientFactory; @@ -11,230 +28,222 @@ import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.Person; -import java.io.StringReader; -import java.security.MessageDigest; -import java.text.Normalizer; -import java.util.*; -import java.util.stream.Collectors; -import org.apache.commons.codec.binary.Hex; -import org.apache.commons.lang3.StringUtils; -import org.apache.spark.SparkContext; -import org.apache.spark.util.LongAccumulator; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Element; -import org.dom4j.io.SAXReader; import scala.Tuple2; public class DedupUtility { - private static final Double THRESHOLD = 0.95; + private static final Double THRESHOLD = 0.95; - public static Map constructAccumulator( - final DedupConfig dedupConf, final SparkContext context) { + public static Map constructAccumulator( + final DedupConfig dedupConf, final SparkContext context) { - Map accumulators = new HashMap<>(); + Map accumulators = new HashMap<>(); - String acc1 = - String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); - accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = - String.format( - "%s::%s", - dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); - accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = - String.format( - "%s::%s", - dedupConf.getWf().getEntityType(), - String.format( - "Skipped records for count(%s) >= %s", - 
dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); - accumulators.put(acc3, context.longAccumulator(acc3)); - String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); - accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = - String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); - accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = - String.format( - "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); - accumulators.put(acc6, context.longAccumulator(acc6)); + String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); + accumulators.put(acc1, context.longAccumulator(acc1)); + String acc2 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); + accumulators.put(acc2, context.longAccumulator(acc2)); + String acc3 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), + String + .format( + "Skipped records for count(%s) >= %s", + dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); + accumulators.put(acc3, context.longAccumulator(acc3)); + String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); + accumulators.put(acc4, context.longAccumulator(acc4)); + String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); + accumulators.put(acc5, context.longAccumulator(acc5)); + String acc6 = String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); + accumulators.put(acc6, context.longAccumulator(acc6)); - return accumulators; - } + return accumulators; + } - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { - return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); - } + static Set getGroupingKeys(DedupConfig conf, MapDocument 
doc) { + return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); + } - public static String md5(final String s) { - try { - final MessageDigest md = MessageDigest.getInstance("MD5"); - md.update(s.getBytes("UTF-8")); - return new String(Hex.encodeHex(md.digest())); - } catch (final Exception e) { - System.err.println("Error creating id"); - return null; - } - } + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes("UTF-8")); + return new String(Hex.encodeHex(md.digest())); + } catch (final Exception e) { + System.err.println("Error creating id"); + return null; + } + } - public static List mergeAuthor(final List a, final List b) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base, enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); + public static List mergeAuthor(final List a, final List b) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base, enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } - enrichPidFromList(base, enrich); - return base; - } + if (pa == pb) { + base = sa > sb ? a : b; + enrich = sa > sb ? b : a; + } else { + base = pa > pb ? a : b; + enrich = pa > pb ? 
b : a; + } + enrichPidFromList(base, enrich); + return base; + } - private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) return; - final Map basePidAuthorMap = - base.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + private static void enrichPidFromList(List base, List enrich) { + if (base == null || enrich == null) + return; + final Map basePidAuthorMap = base + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - final List> pidToEnrich = - enrich.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> - a.getPid().stream() - .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); - pidToEnrich.forEach( - a -> { - Optional> simAuhtor = - base.stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { - Author r = simAuhtor.get()._2(); - r.getPid().add(a._1()); - } - }); - } + pidToEnrich + .forEach( + a -> { + Optional> simAuhtor = base + .stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { + Author r = simAuhtor.get()._2(); + r.getPid().add(a._1()); + } + }); 
+ } - public static String createDedupRecordPath( - final String basePath, final String actionSetId, final String entityType) { - return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); - } + public static String createDedupRecordPath( + final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); + } - public static String createEntityPath(final String basePath, final String entityType) { - return String.format("%s/%s", basePath, entityType); - } + public static String createEntityPath(final String basePath, final String entityType) { + return String.format("%s/%s", basePath, entityType); + } - public static String createSimRelPath( - final String basePath, final String actionSetId, final String entityType) { - return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType); - } + public static String createSimRelPath( + final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType); + } - public static String createMergeRelPath( - final String basePath, final String actionSetId, final String entityType) { - return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); - } + public static String createMergeRelPath( + final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); + } - private static Double sim(Author a, Author b) { + private static Double sim(Author a, Author b) { - final Person pa = parse(a); - final Person pb = parse(b); + final Person pa = parse(a); + final Person pb = parse(b); - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler() - .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), 
normalize(pb.getNormalisedFullname())); - } - } + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler() + .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); + } else { + return new JaroWinkler() + .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); + } + } - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + return new Person(author.getFullname(), false); + } + } - private static int countAuthorsPids(List authors) { - if (authors == null) return 0; + private static int countAuthorsPids(List authors) { + if 
(authors == null) + return 0; - return (int) authors.stream().filter(DedupUtility::hasPid).count(); - } + return (int) authors.stream().filter(DedupUtility::hasPid).count(); + } - private static int authorsSize(List authors) { - if (authors == null) return 0; - return authors.size(); - } + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); - } + private static boolean hasPid(Author a) { + if (a == null || a.getPid() == null || a.getPid().size() == 0) + return false; + return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + } - public static List getConfigurations(String isLookUpUrl, String orchestrator) - throws ISLookUpException, DocumentException { - final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); + public static List getConfigurations(String isLookUpUrl, String orchestrator) + throws ISLookUpException, DocumentException { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); - final String xquery = - String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); - String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); + String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); - final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); + final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); - final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); - final List configurations = new ArrayList<>(); + final String 
actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); + final List configurations = new ArrayList<>(); - for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { - configurations.add(loadConfig(isLookUpService, actionSetId, o)); - } + for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { + configurations.add(loadConfig(isLookUpService, actionSetId, o)); + } - return configurations; - } + return configurations; + } - private static DedupConfig loadConfig( - final ISLookUpService isLookUpService, final String actionSetId, final Object o) - throws ISLookUpException { - final Element s = (Element) o; - final String configProfileId = s.attributeValue("id"); - final String conf = - isLookUpService.getResourceProfileByQuery( - String.format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - configProfileId)); - final DedupConfig dedupConfig = DedupConfig.load(conf); - dedupConfig.getWf().setConfigurationId(actionSetId); - return dedupConfig; - } + private static DedupConfig loadConfig( + final ISLookUpService isLookUpService, final String actionSetId, final Object o) + throws ISLookUpException { + final Element s = (Element) o; + final String configProfileId = s.attributeValue("id"); + final String conf = isLookUpService + .getResourceProfileByQuery( + String + .format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); + final DedupConfig dedupConfig = DedupConfig.load(conf); + dedupConfig.getWf().setConfigurationId(actionSetId); + return dedupConfig; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java index a7515d5757..c72940deb8 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java +++ 
b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java @@ -1,54 +1,57 @@ + package eu.dnetlib.dhp.oa.dedup; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.util.LongAccumulator; + import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.util.LongAccumulator; import scala.Serializable; import scala.Tuple2; public class Deduper implements Serializable { - public static JavaPairRDD computeRelations( - JavaSparkContext context, JavaPairRDD blocks, DedupConfig config) { - Map accumulators = - DedupUtility.constructAccumulator(config, context.sc()); + public static JavaPairRDD computeRelations( + JavaSparkContext context, JavaPairRDD blocks, DedupConfig config) { + Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - return blocks - .flatMapToPair( - it -> { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config) - .processSortedBlock(it._1(), it._2().getDocuments(), reporter); - return reporter.getRelations().iterator(); - }) - .mapToPair(it -> new Tuple2<>(it._1() + it._2(), it)) - .reduceByKey((a, b) -> a) - .mapToPair(Tuple2::_2); - } + return blocks + .flatMapToPair( + it -> { + final SparkReporter reporter = new SparkReporter(accumulators); + new BlockProcessor(config) + .processSortedBlock(it._1(), it._2().getDocuments(), reporter); + return reporter.getRelations().iterator(); + }) + .mapToPair(it -> new Tuple2<>(it._1() + it._2(), it)) + .reduceByKey((a, b) -> a) + .mapToPair(Tuple2::_2); + } - public static JavaPairRDD createSortedBlocks( 
- JavaPairRDD mapDocs, DedupConfig config) { - final String of = config.getWf().getOrderField(); - final int maxQueueSize = config.getWf().getGroupMaxSize(); + public static JavaPairRDD createSortedBlocks( + JavaPairRDD mapDocs, DedupConfig config) { + final String of = config.getWf().getOrderField(); + final int maxQueueSize = config.getWf().getGroupMaxSize(); - return mapDocs - // the reduce is just to be sure that we haven't document with same id - .reduceByKey((a, b) -> a) - .map(Tuple2::_2) - // Clustering: from to List - .flatMap( - a -> - DedupUtility.getGroupingKeys(config, a).stream() - .map(it -> Block.from(it, a)) - .collect(Collectors.toList()) - .iterator()) - .mapToPair(block -> new Tuple2<>(block.getKey(), block)) - .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize)); - } + return mapDocs + // the reduce is just to be sure that we haven't document with same id + .reduceByKey((a, b) -> a) + .map(Tuple2::_2) + // Clustering: from to List + .flatMap( + a -> DedupUtility + .getGroupingKeys(config, a) + .stream() + .map(it -> Block.from(it, a)) + .collect(Collectors.toList()) + .iterator()) + .mapToPair(block -> new Tuple2<>(block.getKey(), block)) + .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize)); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java index b47b880e93..d870f6256e 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java @@ -1,5 +1,16 @@ + package eu.dnetlib.dhp.oa.dedup; +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; 
+import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelSupport; @@ -10,92 +21,84 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; -import java.io.IOException; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.dom4j.DocumentException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class SparkCreateDedupRecord extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupRecord.class); + private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupRecord.class); - public static final String ROOT_TRUST = "0.8"; - public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup"; - public static final String PROVENANCE_ACTIONS = "dnet:provenanceActions"; + public static final String ROOT_TRUST = "0.8"; + public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup"; + public static final String PROVENANCE_ACTIONS = "dnet:provenanceActions"; - public SparkCreateDedupRecord(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkCreateDedupRecord(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser 
parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - new SparkCreateDedupRecord(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkCreateDedupRecord(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - @Override - public void run(ISLookUpService isLookUpService) - throws ISLookUpException, DocumentException, IOException { + @Override + public void run(ISLookUpService isLookUpService) + throws ISLookUpException, DocumentException, IOException { - final String graphBasePath = parser.get("graphBasePath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); - final String workingPath = parser.get("workingPath"); + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("isLookUpUrl: '{}'", isLookUpUrl); - log.info("actionSetId: '{}'", actionSetId); - log.info("workingPath: '{}'", workingPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); - for (DedupConfig dedupConf : getConfigurations(isLookUpService, 
actionSetId)) { - String subEntity = dedupConf.getWf().getSubEntityValue(); - log.info("Creating deduprecords for: '{}'", subEntity); + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + String subEntity = dedupConf.getWf().getSubEntityValue(); + log.info("Creating deduprecords for: '{}'", subEntity); - final String outputPath = - DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity); - removeOutputDir(spark, outputPath); + final String outputPath = DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity); + removeOutputDir(spark, outputPath); - final String mergeRelPath = - DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); - final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity); + final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); + final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity); - final Class clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity)); - final DataInfo dataInfo = getDataInfo(dedupConf); - DedupRecordFactory.createDedupRecord(spark, dataInfo, mergeRelPath, entityPath, clazz) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(outputPath); - } - } + final Class clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity)); + final DataInfo dataInfo = getDataInfo(dedupConf); + DedupRecordFactory + .createDedupRecord(spark, dataInfo, mergeRelPath, entityPath, clazz) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + } - private static DataInfo getDataInfo(DedupConfig dedupConf) { - DataInfo info = new DataInfo(); - info.setDeletedbyinference(false); - info.setInferred(true); - info.setInvisible(false); - info.setTrust(ROOT_TRUST); - info.setInferenceprovenance(dedupConf.getWf().getConfigurationId()); - Qualifier provenance = new Qualifier(); - 
provenance.setClassid(PROVENANCE_ACTION_CLASS); - provenance.setClassname(PROVENANCE_ACTION_CLASS); - provenance.setSchemeid(PROVENANCE_ACTIONS); - provenance.setSchemename(PROVENANCE_ACTIONS); - info.setProvenanceaction(provenance); - return info; - } + private static DataInfo getDataInfo(DedupConfig dedupConf) { + DataInfo info = new DataInfo(); + info.setDeletedbyinference(false); + info.setInferred(true); + info.setInvisible(false); + info.setTrust(ROOT_TRUST); + info.setInferenceprovenance(dedupConf.getWf().getConfigurationId()); + Qualifier provenance = new Qualifier(); + provenance.setClassid(PROVENANCE_ACTION_CLASS); + provenance.setClassname(PROVENANCE_ACTION_CLASS); + provenance.setSchemeid(PROVENANCE_ACTIONS); + provenance.setSchemename(PROVENANCE_ACTIONS); + info.setProvenanceaction(provenance); + return info; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 229379a53d..a446508233 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -1,22 +1,11 @@ + package eu.dnetlib.dhp.oa.dedup; -import com.google.common.hash.Hashing; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent; -import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; -import 
eu.dnetlib.pace.util.MapDocumentUtil; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; @@ -31,132 +20,149 @@ import org.apache.spark.sql.SparkSession; import org.dom4j.DocumentException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.google.common.hash.Hashing; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent; +import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; public class SparkCreateMergeRels extends AbstractSparkAction { - public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup"; - private static final Logger log = LoggerFactory.getLogger(SparkCreateMergeRels.class); - public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; + public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup"; + private static final Logger log = LoggerFactory.getLogger(SparkCreateMergeRels.class); + public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; - public SparkCreateMergeRels(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkCreateMergeRels(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new 
ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); + parser.parseArgument(args); - final String isLookUpUrl = parser.get("isLookUpUrl"); - log.info("isLookupUrl {}", isLookUpUrl); + final String isLookUpUrl = parser.get("isLookUpUrl"); + log.info("isLookupUrl {}", isLookUpUrl); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - new SparkCreateMergeRels(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(isLookUpUrl)); - } + new SparkCreateMergeRels(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(isLookUpUrl)); + } - @Override - public void run(ISLookUpService isLookUpService) - throws ISLookUpException, DocumentException, IOException { + @Override + public void run(ISLookUpService isLookUpService) + throws ISLookUpException, DocumentException, IOException { - final String graphBasePath = parser.get("graphBasePath"); - final String workingPath = parser.get("workingPath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); - 
log.info("graphBasePath: '{}'", graphBasePath); - log.info("isLookUpUrl: '{}'", isLookUpUrl); - log.info("actionSetId: '{}'", actionSetId); - log.info("workingPath: '{}'", workingPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { - final String subEntity = dedupConf.getWf().getSubEntityValue(); + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + final String subEntity = dedupConf.getWf().getSubEntityValue(); - log.info("Creating mergerels for: '{}'", subEntity); + log.info("Creating mergerels for: '{}'", subEntity); - final int maxIterations = dedupConf.getWf().getMaxIterations(); - log.info("Max iterations {}", maxIterations); + final int maxIterations = dedupConf.getWf().getMaxIterations(); + log.info("Max iterations {}", maxIterations); - final String mergeRelPath = - DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); + final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); - final JavaPairRDD vertexes = - sc.textFile(graphBasePath + "/" + subEntity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair((PairFunction) s -> new Tuple2<>(hash(s), s)); + final JavaPairRDD vertexes = sc + .textFile(graphBasePath + "/" + subEntity) + .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) + .mapToPair((PairFunction) s -> new Tuple2<>(hash(s), s)); - final RDD> edgeRdd = - spark - .read() - .load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) - .as(Encoders.bean(Relation.class)) - .javaRDD() - .map(it -> 
new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass())) - .rdd(); + final RDD> edgeRdd = spark + .read() + .load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) + .as(Encoders.bean(Relation.class)) + .javaRDD() + .map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass())) + .rdd(); - final Dataset mergeRels = - spark.createDataset( - GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, maxIterations) - .toJavaRDD() - .filter(k -> k.getDocIds().size() > 1) - .flatMap(cc -> ccToMergeRel(cc, dedupConf)) - .rdd(), - Encoders.bean(Relation.class)); + final Dataset mergeRels = spark + .createDataset( + GraphProcessor + .findCCs(vertexes.rdd(), edgeRdd, maxIterations) + .toJavaRDD() + .filter(k -> k.getDocIds().size() > 1) + .flatMap(cc -> ccToMergeRel(cc, dedupConf)) + .rdd(), + Encoders.bean(Relation.class)); - mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath); - } - } + mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath); + } + } - public Iterator ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) { - return cc.getDocIds().stream() - .flatMap( - id -> { - List tmp = new ArrayList<>(); + public Iterator ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) { + return cc + .getDocIds() + .stream() + .flatMap( + id -> { + List tmp = new ArrayList<>(); - tmp.add(rel(cc.getCcId(), id, "merges", dedupConf)); - tmp.add(rel(id, cc.getCcId(), "isMergedIn", dedupConf)); + tmp.add(rel(cc.getCcId(), id, "merges", dedupConf)); + tmp.add(rel(id, cc.getCcId(), "isMergedIn", dedupConf)); - return tmp.stream(); - }) - .iterator(); - } + return tmp.stream(); + }) + .iterator(); + } - private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) { - Relation r = new Relation(); - r.setSource(source); - r.setTarget(target); - r.setRelClass(relClass); - r.setSubRelType("dedup"); + private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) 
{ + Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + r.setRelClass(relClass); + r.setSubRelType("dedup"); - DataInfo info = new DataInfo(); - info.setDeletedbyinference(false); - info.setInferred(true); - info.setInvisible(false); - info.setInferenceprovenance(dedupConf.getWf().getConfigurationId()); - Qualifier provenanceAction = new Qualifier(); - provenanceAction.setClassid(PROVENANCE_ACTION_CLASS); - provenanceAction.setClassname(PROVENANCE_ACTION_CLASS); - provenanceAction.setSchemeid(DNET_PROVENANCE_ACTIONS); - provenanceAction.setSchemename(DNET_PROVENANCE_ACTIONS); - info.setProvenanceaction(provenanceAction); + DataInfo info = new DataInfo(); + info.setDeletedbyinference(false); + info.setInferred(true); + info.setInvisible(false); + info.setInferenceprovenance(dedupConf.getWf().getConfigurationId()); + Qualifier provenanceAction = new Qualifier(); + provenanceAction.setClassid(PROVENANCE_ACTION_CLASS); + provenanceAction.setClassname(PROVENANCE_ACTION_CLASS); + provenanceAction.setSchemeid(DNET_PROVENANCE_ACTIONS); + provenanceAction.setSchemename(DNET_PROVENANCE_ACTIONS); + info.setProvenanceaction(provenanceAction); - // TODO calculate the trust value based on the similarity score of the elements in the CC - // info.setTrust(); + // TODO calculate the trust value based on the similarity score of the elements in the CC + // info.setTrust(); - r.setDataInfo(info); - return r; - } + r.setDataInfo(info); + return r; + } - public static long hash(final String id) { - return Hashing.murmur3_128().hashString(id).asLong(); - } + public static long hash(final String id) { + return Hashing.murmur3_128().hashString(id).asLong(); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index c5a1d768cd..2cfe2e0801 100644 --- 
a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -1,5 +1,21 @@ + package eu.dnetlib.dhp.oa.dedup; +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -12,117 +28,107 @@ import eu.dnetlib.pace.model.FieldListImpl; import eu.dnetlib.pace.model.FieldValueImpl; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; -import java.io.IOException; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.dom4j.DocumentException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import scala.Tuple2; public class SparkCreateSimRels extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); + private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); - public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, 
spark); - } + public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses( - new Class[] {MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class}); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf + .registerKryoClasses( + new Class[] { + MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class + }); - new SparkCreateSimRels(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkCreateSimRels(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - @Override - public void run(ISLookUpService isLookUpService) - throws DocumentException, IOException, ISLookUpException { + @Override + public void run(ISLookUpService isLookUpService) + throws DocumentException, IOException, ISLookUpException { - // read oozie parameters - final String graphBasePath = parser.get("graphBasePath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); - final String workingPath = parser.get("workingPath"); + // read oozie parameters + final 
String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("isLookUpUrl: '{}'", isLookUpUrl); - log.info("actionSetId: '{}'", actionSetId); - log.info("workingPath: '{}'", workingPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); - // for each dedup configuration - for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + // for each dedup configuration + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { - final String entity = dedupConf.getWf().getEntityType(); - final String subEntity = dedupConf.getWf().getSubEntityValue(); - log.info("Creating simrels for: '{}'", subEntity); + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + log.info("Creating simrels for: '{}'", subEntity); - final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity); - removeOutputDir(spark, outputPath); + final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity); + removeOutputDir(spark, outputPath); - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaPairRDD mapDocuments = - sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .mapToPair( - (PairFunction) - s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); + JavaPairRDD mapDocuments = sc + .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .mapToPair( + 
(PairFunction) s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); - // create blocks for deduplication - JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); + // create blocks for deduplication + JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); - // create relations by comparing only elements in the same group - JavaRDD relations = - Deduper.computeRelations(sc, blocks, dedupConf) - .map(t -> createSimRel(t._1(), t._2(), entity)); + // create relations by comparing only elements in the same group + JavaRDD relations = Deduper + .computeRelations(sc, blocks, dedupConf) + .map(t -> createSimRel(t._1(), t._2(), entity)); - // save the simrel in the workingdir - spark - .createDataset(relations.rdd(), Encoders.bean(Relation.class)) - .write() - .mode(SaveMode.Append) - .save(outputPath); - } - } + // save the simrel in the workingdir + spark + .createDataset(relations.rdd(), Encoders.bean(Relation.class)) + .write() + .mode(SaveMode.Append) + .save(outputPath); + } + } - private Relation createSimRel(String source, String target, String entity) { - final Relation r = new Relation(); - r.setSource(source); - r.setTarget(target); - r.setSubRelType("dedupSimilarity"); - r.setRelClass("isSimilarTo"); - r.setDataInfo(new DataInfo()); + private Relation createSimRel(String source, String target, String entity) { + final Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + r.setSubRelType("dedupSimilarity"); + r.setRelClass("isSimilarTo"); + r.setDataInfo(new DataInfo()); - switch (entity) { - case "result": - r.setRelType("resultResult"); - break; - case "organization": - r.setRelType("organizationOrganization"); - break; - default: - throw new IllegalArgumentException("unmanaged entity type: " + entity); - } - return r; - } + switch (entity) { + case "result": + r.setRelType("resultResult"); + break; + case "organization": + 
r.setRelType("organizationOrganization"); + break; + default: + throw new IllegalArgumentException("unmanaged entity type: " + entity); + } + return r; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index d829a9a03a..34611db8e2 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -1,183 +1,178 @@ + package eu.dnetlib.dhp.oa.dedup; import static org.apache.spark.sql.functions.col; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import scala.Tuple2; public class SparkPropagateRelation extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkPropagateRelation.class); + private static final Logger log = LoggerFactory.getLogger(SparkPropagateRelation.class); - enum FieldType { - SOURCE, - TARGET - } + enum FieldType { + SOURCE, TARGET + } - public SparkPropagateRelation(ArgumentApplicationParser parser, SparkSession spark) - throws Exception { - super(parser, spark); - } + public 
SparkPropagateRelation(ArgumentApplicationParser parser, SparkSession spark) + throws Exception { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); - parser.parseArgument(args); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - new SparkPropagateRelation(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkPropagateRelation(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - @Override - public void run(ISLookUpService isLookUpService) { + @Override + public void run(ISLookUpService isLookUpService) { - final String graphBasePath = parser.get("graphBasePath"); - final String workingPath = parser.get("workingPath"); - final String dedupGraphPath = parser.get("dedupGraphPath"); + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("workingPath: '{}'", workingPath); - log.info("dedupGraphPath: '{}'", dedupGraphPath); + 
log.info("graphBasePath: '{}'", graphBasePath); + log.info("workingPath: '{}'", workingPath); + log.info("dedupGraphPath: '{}'", dedupGraphPath); - final String outputRelationPath = DedupUtility.createEntityPath(dedupGraphPath, "relation"); - removeOutputDir(spark, outputRelationPath); + final String outputRelationPath = DedupUtility.createEntityPath(dedupGraphPath, "relation"); + removeOutputDir(spark, outputRelationPath); - Dataset mergeRels = - spark - .read() - .load(DedupUtility.createMergeRelPath(workingPath, "*", "*")) - .as(Encoders.bean(Relation.class)); + Dataset mergeRels = spark + .read() + .load(DedupUtility.createMergeRelPath(workingPath, "*", "*")) + .as(Encoders.bean(Relation.class)); - Dataset> mergedIds = - mergeRels - .where(col("relClass").equalTo("merges")) - .select(col("source"), col("target")) - .distinct() - .map( - (MapFunction>) - r -> new Tuple2<>(r.getString(1), r.getString(0)), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())) - .cache(); + Dataset> mergedIds = mergeRels + .where(col("relClass").equalTo("merges")) + .select(col("source"), col("target")) + .distinct() + .map( + (MapFunction>) r -> new Tuple2<>(r.getString(1), r.getString(0)), + Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .cache(); - final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation"); + final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation"); - Dataset rels = - spark.read().textFile(relationPath).map(patchRelFn(), Encoders.bean(Relation.class)); + Dataset rels = spark.read().textFile(relationPath).map(patchRelFn(), Encoders.bean(Relation.class)); - Dataset newRels = - processDataset( - processDataset(rels, mergedIds, FieldType.SOURCE, getFixRelFn(FieldType.SOURCE)), - mergedIds, - FieldType.TARGET, - getFixRelFn(FieldType.TARGET)) - .filter(SparkPropagateRelation::containsDedup); + Dataset newRels = processDataset( + processDataset(rels, mergedIds, FieldType.SOURCE, 
getFixRelFn(FieldType.SOURCE)), + mergedIds, + FieldType.TARGET, + getFixRelFn(FieldType.TARGET)) + .filter(SparkPropagateRelation::containsDedup); - Dataset updated = - processDataset( - processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()), - mergedIds, - FieldType.TARGET, - getDeletedFn()); + Dataset updated = processDataset( + processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()), + mergedIds, + FieldType.TARGET, + getDeletedFn()); - save(newRels.union(updated), outputRelationPath, SaveMode.Overwrite); - } + save(newRels.union(updated), outputRelationPath, SaveMode.Overwrite); + } - private static Dataset processDataset( - Dataset rels, - Dataset> mergedIds, - FieldType type, - MapFunction, Tuple2>, Relation> mapFn) { - final Dataset> mapped = - rels.map( - (MapFunction>) r -> new Tuple2<>(getId(r, type), r), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))); - return mapped - .joinWith(mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer") - .map(mapFn, Encoders.bean(Relation.class)); - } + private static Dataset processDataset( + Dataset rels, + Dataset> mergedIds, + FieldType type, + MapFunction, Tuple2>, Relation> mapFn) { + final Dataset> mapped = rels + .map( + (MapFunction>) r -> new Tuple2<>(getId(r, type), r), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))); + return mapped + .joinWith(mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer") + .map(mapFn, Encoders.bean(Relation.class)); + } - private static MapFunction patchRelFn() { - return value -> { - final Relation rel = OBJECT_MAPPER.readValue(value, Relation.class); - if (rel.getDataInfo() == null) { - rel.setDataInfo(new DataInfo()); - } - return rel; - }; - } + private static MapFunction patchRelFn() { + return value -> { + final Relation rel = OBJECT_MAPPER.readValue(value, Relation.class); + if (rel.getDataInfo() == null) { + rel.setDataInfo(new DataInfo()); + } + return rel; + }; + } - private 
static String getId(Relation r, FieldType type) { - switch (type) { - case SOURCE: - return r.getSource(); - case TARGET: - return r.getTarget(); - default: - throw new IllegalArgumentException(""); - } - } + private static String getId(Relation r, FieldType type) { + switch (type) { + case SOURCE: + return r.getSource(); + case TARGET: + return r.getTarget(); + default: + throw new IllegalArgumentException(""); + } + } - private static MapFunction, Tuple2>, Relation> - getFixRelFn(FieldType type) { - return value -> { - if (value._2() != null) { - Relation r = value._1()._2(); - String id = value._2()._2(); - if (r.getDataInfo() == null) { - r.setDataInfo(new DataInfo()); - } - r.getDataInfo().setDeletedbyinference(false); - switch (type) { - case SOURCE: - r.setSource(id); - return r; - case TARGET: - r.setTarget(id); - return r; - default: - throw new IllegalArgumentException(""); - } - } - return value._1()._2(); - }; - } + private static MapFunction, Tuple2>, Relation> getFixRelFn( + FieldType type) { + return value -> { + if (value._2() != null) { + Relation r = value._1()._2(); + String id = value._2()._2(); + if (r.getDataInfo() == null) { + r.setDataInfo(new DataInfo()); + } + r.getDataInfo().setDeletedbyinference(false); + switch (type) { + case SOURCE: + r.setSource(id); + return r; + case TARGET: + r.setTarget(id); + return r; + default: + throw new IllegalArgumentException(""); + } + } + return value._1()._2(); + }; + } - private static MapFunction, Tuple2>, Relation> - getDeletedFn() { - return value -> { - if (value._2() != null) { - Relation r = value._1()._2(); - if (r.getDataInfo() == null) { - r.setDataInfo(new DataInfo()); - } - r.getDataInfo().setDeletedbyinference(true); - return r; - } - return value._1()._2(); - }; - } + private static MapFunction, Tuple2>, Relation> getDeletedFn() { + return value -> { + if (value._2() != null) { + Relation r = value._1()._2(); + if (r.getDataInfo() == null) { + r.setDataInfo(new DataInfo()); + } + 
r.getDataInfo().setDeletedbyinference(true); + return r; + } + return value._1()._2(); + }; + } - private static boolean containsDedup(final Relation r) { - return r.getSource().toLowerCase().contains("dedup") - || r.getTarget().toLowerCase().contains("dedup"); - } + private static boolean containsDedup(final Relation r) { + return r.getSource().toLowerCase().contains("dedup") + || r.getTarget().toLowerCase().contains("dedup"); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java index 2ffd982b14..7100c90372 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java @@ -1,47 +1,50 @@ + package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.pace.util.Reporter; import java.util.ArrayList; import java.util.List; import java.util.Map; + import org.apache.spark.util.LongAccumulator; + +import eu.dnetlib.pace.util.Reporter; import scala.Serializable; import scala.Tuple2; public class SparkReporter implements Serializable, Reporter { - private final List> relations = new ArrayList<>(); + private final List> relations = new ArrayList<>(); - private Map accumulators; + private Map accumulators; - public SparkReporter(Map accumulators) { - this.accumulators = accumulators; - } + public SparkReporter(Map accumulators) { + this.accumulators = accumulators; + } - public void incrementCounter( - String counterGroup, - String counterName, - long delta, - Map accumulators) { + public void incrementCounter( + String counterGroup, + String counterName, + long delta, + Map accumulators) { - final String accumulatorName = String.format("%s::%s", counterGroup, counterName); - if (accumulators.containsKey(accumulatorName)) { - accumulators.get(accumulatorName).add(delta); - } - } + final String 
accumulatorName = String.format("%s::%s", counterGroup, counterName); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(delta); + } + } - @Override - public void incrementCounter(String counterGroup, String counterName, long delta) { + @Override + public void incrementCounter(String counterGroup, String counterName, long delta) { - incrementCounter(counterGroup, counterName, delta, accumulators); - } + incrementCounter(counterGroup, counterName, delta, accumulators); + } - @Override - public void emit(String type, String from, String to) { - relations.add(new Tuple2<>(from, to)); - } + @Override + public void emit(String type, String from, String to) { + relations.add(new Tuple2<>(from, to)); + } - public List> getRelations() { - return relations; - } + public List> getRelations() { + return relations; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java index 56dec79cf4..779fb91d69 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java @@ -1,15 +1,8 @@ + package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.io.IOException; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -27,123 +20,133 @@ import 
org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; public class SparkUpdateEntity extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkUpdateEntity.class); + private static final Logger log = LoggerFactory.getLogger(SparkUpdateEntity.class); - private static final String IDJSONPATH = "$.id"; + private static final String IDJSONPATH = "$.id"; - public SparkUpdateEntity(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkUpdateEntity(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkUpdateEntity.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkUpdateEntity.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", 
"org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - new SparkUpdateEntity(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkUpdateEntity(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - public void run(ISLookUpService isLookUpService) throws IOException { + public void run(ISLookUpService isLookUpService) throws IOException { - final String graphBasePath = parser.get("graphBasePath"); - final String workingPath = parser.get("workingPath"); - final String dedupGraphPath = parser.get("dedupGraphPath"); + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("workingPath: '{}'", workingPath); - log.info("dedupGraphPath: '{}'", dedupGraphPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("workingPath: '{}'", workingPath); + log.info("dedupGraphPath: '{}'", dedupGraphPath); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - // for each entity - ModelSupport.entityTypes.forEach( - (type, clazz) -> { - final String outputPath = dedupGraphPath + "/" + type; - removeOutputDir(spark, outputPath); + // for each entity + ModelSupport.entityTypes + .forEach( + (type, clazz) -> { + final String outputPath = dedupGraphPath + "/" + type; + removeOutputDir(spark, outputPath); - JavaRDD sourceEntity = - sc.textFile(DedupUtility.createEntityPath(graphBasePath, type.toString())); + JavaRDD sourceEntity = sc + .textFile(DedupUtility.createEntityPath(graphBasePath, type.toString())); - if (mergeRelExists(workingPath, type.toString())) { + if (mergeRelExists(workingPath, 
type.toString())) { - final String mergeRelPath = - DedupUtility.createMergeRelPath(workingPath, "*", type.toString()); - final String dedupRecordPath = - DedupUtility.createDedupRecordPath(workingPath, "*", type.toString()); + final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, "*", type.toString()); + final String dedupRecordPath = DedupUtility + .createDedupRecordPath(workingPath, "*", type.toString()); - final Dataset rel = - spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); + final Dataset rel = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = - rel.where("relClass == 'merges'") - .select(rel.col("target")) - .distinct() - .toJavaRDD() - .mapToPair( - (PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + final JavaPairRDD mergedIds = rel + .where("relClass == 'merges'") + .select(rel.col("target")) + .distinct() + .toJavaRDD() + .mapToPair( + (PairFunction) r -> new Tuple2<>(r.getString(0), "d")); - JavaPairRDD entitiesWithId = - sourceEntity.mapToPair( - (PairFunction) - s -> new Tuple2<>(MapDocumentUtil.getJPathString(IDJSONPATH, s), s)); - JavaRDD map = - entitiesWithId - .leftOuterJoin(mergedIds) - .map( - k -> - k._2()._2().isPresent() - ? updateDeletedByInference(k._2()._1(), clazz) - : k._2()._1()); + JavaPairRDD entitiesWithId = sourceEntity + .mapToPair( + (PairFunction) s -> new Tuple2<>( + MapDocumentUtil.getJPathString(IDJSONPATH, s), s)); + JavaRDD map = entitiesWithId + .leftOuterJoin(mergedIds) + .map( + k -> k._2()._2().isPresent() + ? 
updateDeletedByInference(k._2()._1(), clazz) + : k._2()._1()); - sourceEntity = map.union(sc.textFile(dedupRecordPath)); - } + sourceEntity = map.union(sc.textFile(dedupRecordPath)); + } - sourceEntity.saveAsTextFile(outputPath, GzipCodec.class); - }); - } + sourceEntity.saveAsTextFile(outputPath, GzipCodec.class); + }); + } - public boolean mergeRelExists(String basePath, String entity) { + public boolean mergeRelExists(String basePath, String entity) { - boolean result = false; - try { - FileSystem fileSystem = FileSystem.get(new Configuration()); + boolean result = false; + try { + FileSystem fileSystem = FileSystem.get(new Configuration()); - FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath)); + FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath)); - for (FileStatus fs : fileStatuses) { - if (fs.isDirectory()) - if (fileSystem.exists( - new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity)))) - result = true; - } + for (FileStatus fs : fileStatuses) { + if (fs.isDirectory()) + if (fileSystem + .exists( + new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity)))) + result = true; + } - return result; - } catch (IOException e) { - throw new RuntimeException(e); - } - } + return result; + } catch (IOException e) { + throw new RuntimeException(e); + } + } - private static String updateDeletedByInference( - final String json, final Class clazz) { - try { - Oaf entity = OBJECT_MAPPER.readValue(json, clazz); - if (entity.getDataInfo() == null) entity.setDataInfo(new DataInfo()); - entity.getDataInfo().setDeletedbyinference(true); - return OBJECT_MAPPER.writeValueAsString(entity); - } catch (IOException e) { - throw new RuntimeException("Unable to convert json", e); - } - } + private static String updateDeletedByInference( + final String json, final Class clazz) { + try { + Oaf entity = OBJECT_MAPPER.readValue(json, clazz); + if (entity.getDataInfo() == null) + 
entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return OBJECT_MAPPER.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index f4370a79c3..bfd2c25e21 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -1,78 +1,84 @@ + package eu.dnetlib.dhp.oa.dedup.graph; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.dedup.DedupUtility; -import eu.dnetlib.pace.util.PaceException; import java.io.IOException; import java.io.Serializable; import java.util.Set; + import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.oa.dedup.DedupUtility; +import eu.dnetlib.pace.util.PaceException; + public class ConnectedComponent implements Serializable { - private Set docIds; - private String ccId; + private Set docIds; + private String ccId; - public ConnectedComponent() {} + public ConnectedComponent() { + } - public ConnectedComponent(Set docIds) { - this.docIds = docIds; - createID(); - } + public ConnectedComponent(Set docIds) { + this.docIds = docIds; + createID(); + } - public String createID() { - if (docIds.size() > 1) { - final String s = getMin(); - String prefix = s.split("\\|")[0]; - ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); - return ccId; - } else { - return docIds.iterator().next(); - } - } + public String createID() { + if (docIds.size() > 1) { + final String s = getMin(); + String prefix = s.split("\\|")[0]; + ccId = prefix 
+ "|dedup_wf_001::" + DedupUtility.md5(s); + return ccId; + } else { + return docIds.iterator().next(); + } + } - @JsonIgnore - public String getMin() { + @JsonIgnore + public String getMin() { - final StringBuilder min = new StringBuilder(); - docIds.forEach( - i -> { - if (StringUtils.isBlank(min.toString())) { - min.append(i); - } else { - if (min.toString().compareTo(i) > 0) { - min.setLength(0); - min.append(i); - } - } - }); - return min.toString(); - } + final StringBuilder min = new StringBuilder(); + docIds + .forEach( + i -> { + if (StringUtils.isBlank(min.toString())) { + min.append(i); + } else { + if (min.toString().compareTo(i) > 0) { + min.setLength(0); + min.append(i); + } + } + }); + return min.toString(); + } - @Override - public String toString() { - ObjectMapper mapper = new ObjectMapper(); - try { - return mapper.writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Failed to create Json: ", e); - } - } + @Override + public String toString() { + ObjectMapper mapper = new ObjectMapper(); + try { + return mapper.writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Failed to create Json: ", e); + } + } - public Set getDocIds() { - return docIds; - } + public Set getDocIds() { + return docIds; + } - public void setDocIds(Set docIds) { - this.docIds = docIds; - } + public void setDocIds(Set docIds) { + this.docIds = docIds; + } - public String getCcId() { - return ccId; - } + public String getCcId() { + return ccId; + } - public void setCcId(String ccId) { - this.ccId = ccId; - } + public void setCcId(String ccId) { + this.ccId = ccId; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java index 10b622497e..4f0d95c8f5 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java +++ 
b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java @@ -1,7 +1,6 @@ + package eu.dnetlib.dhp.oa.dedup.model; -import com.google.common.collect.Lists; -import eu.dnetlib.pace.model.MapDocument; import java.io.Serializable; import java.util.ArrayList; import java.util.Comparator; @@ -11,63 +10,71 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import com.google.common.collect.Lists; + +import eu.dnetlib.pace.model.MapDocument; + public class Block implements Serializable { - private String key; + private String key; - private List documents; + private List documents; - public Block() { - super(); - } + public Block() { + super(); + } - public static Block from(String key, MapDocument doc) { - Block block = new Block(); - block.setKey(key); - block.setDocuments(Lists.newArrayList(doc)); - return block; - } + public static Block from(String key, MapDocument doc) { + Block block = new Block(); + block.setKey(key); + block.setDocuments(Lists.newArrayList(doc)); + return block; + } - public static Block from(String key, Iterator blocks, String orderField, int maxSize) { - Block block = new Block(); - block.setKey(key); + public static Block from(String key, Iterator blocks, String orderField, int maxSize) { + Block block = new Block(); + block.setKey(key); - Iterable it = () -> blocks; + Iterable it = () -> blocks; - block.setDocuments( - StreamSupport.stream(it.spliterator(), false) - .flatMap(b -> b.getDocuments().stream()) - .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) - .limit(maxSize) - .collect(Collectors.toCollection(ArrayList::new))); - return block; - } + block + .setDocuments( + StreamSupport + .stream(it.spliterator(), false) + .flatMap(b -> b.getDocuments().stream()) + .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) + .limit(maxSize) + .collect(Collectors.toCollection(ArrayList::new))); + 
return block; + } - public static Block from(Block b1, Block b2, String orderField, int maxSize) { - Block block = new Block(); - block.setKey(b1.getKey()); - block.setDocuments( - Stream.concat(b1.getDocuments().stream(), b2.getDocuments().stream()) - .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) - .limit(maxSize) - .collect(Collectors.toCollection(ArrayList::new))); + public static Block from(Block b1, Block b2, String orderField, int maxSize) { + Block block = new Block(); + block.setKey(b1.getKey()); + block + .setDocuments( + Stream + .concat(b1.getDocuments().stream(), b2.getDocuments().stream()) + .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) + .limit(maxSize) + .collect(Collectors.toCollection(ArrayList::new))); - return block; - } + return block; + } - public String getKey() { - return key; - } + public String getKey() { + return key; + } - public void setKey(String key) { - this.key = key; - } + public void setKey(String key) { + this.key = key; + } - public List getDocuments() { - return documents; - } + public List getDocuments() { + return documents; + } - public void setDocuments(List documents) { - this.documents = documents; - } + public void setDocuments(List documents) { + this.documents = documents; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java index 4236f32e3b..a217a2657f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java @@ -1,49 +1,54 @@ + package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.schema.oaf.Publication; import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; 
import org.codehaus.jackson.map.ObjectMapper; import org.junit.jupiter.api.BeforeEach; +import eu.dnetlib.dhp.schema.oaf.Publication; + public class MergeAuthorTest { - private List publicationsToMerge; - private final ObjectMapper mapper = new ObjectMapper(); + private List publicationsToMerge; + private final ObjectMapper mapper = new ObjectMapper(); - @BeforeEach - public void setUp() throws Exception { - final String json = - IOUtils.toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json")); + @BeforeEach + public void setUp() throws Exception { + final String json = IOUtils + .toString( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json")); - publicationsToMerge = - Arrays.asList(json.split("\n")).stream() - .map( - s -> { - try { - return mapper.readValue(s, Publication.class); - } catch (IOException e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toList()); - } + publicationsToMerge = Arrays + .asList(json.split("\n")) + .stream() + .map( + s -> { + try { + return mapper.readValue(s, Publication.class); + } catch (IOException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); + } - // FIX ME Michele DB this tests doesn't work - // @Test - public void test() throws Exception { - Publication dedup = new Publication(); + // FIX ME Michele DB this tests doesn't work + // @Test + public void test() throws Exception { + Publication dedup = new Publication(); - publicationsToMerge.forEach( - p -> { - dedup.mergeFrom(p); - dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor())); - }); + publicationsToMerge + .forEach( + p -> { + dedup.mergeFrom(p); + dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor())); + }); - System.out.println(mapper.writeValueAsString(dedup)); - } + System.out.println(mapper.writeValueAsString(dedup)); + } } diff --git 
a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 838e7188d5..a0ae7bc3c8 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -1,19 +1,17 @@ + package eu.dnetlib.dhp.oa.dedup; import static java.nio.file.Files.createTempDirectory; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.Mockito.lenient; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.net.URISyntaxException; import java.nio.file.Paths; + import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -30,424 +28,435 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; @ExtendWith(MockitoExtension.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class) public class SparkDedupTest implements Serializable { - @Mock(serializable = true) - ISLookUpService isLookUpService; + @Mock(serializable = true) + ISLookUpService isLookUpService; - private static SparkSession spark; - private static JavaSparkContext jsc; + private static 
SparkSession spark; + private static JavaSparkContext jsc; - private static String testGraphBasePath; - private static String testOutputBasePath; - private static String testDedupGraphBasePath; - private static final String testActionSetId = "test-orchestrator"; + private static String testGraphBasePath; + private static String testOutputBasePath; + private static String testDedupGraphBasePath; + private static final String testActionSetId = "test-orchestrator"; - @BeforeAll - public static void cleanUp() throws IOException, URISyntaxException { + @BeforeAll + public static void cleanUp() throws IOException, URISyntaxException { - testGraphBasePath = - Paths.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI()) - .toFile() - .getAbsolutePath(); - testOutputBasePath = - createTempDirectory(SparkDedupTest.class.getSimpleName() + "-").toAbsolutePath().toString(); - testDedupGraphBasePath = - createTempDirectory(SparkDedupTest.class.getSimpleName() + "-").toAbsolutePath().toString(); + testGraphBasePath = Paths + .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI()) + .toFile() + .getAbsolutePath(); + testOutputBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); + testDedupGraphBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); - FileUtils.deleteDirectory(new File(testOutputBasePath)); - FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); + FileUtils.deleteDirectory(new File(testOutputBasePath)); + FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); - spark = - SparkSession.builder() - .appName(SparkDedupTest.class.getSimpleName()) - .master("local[*]") - .config(new SparkConf()) - .getOrCreate(); + spark = SparkSession + .builder() + .appName(SparkDedupTest.class.getSimpleName()) + .master("local[*]") + .config(new SparkConf()) + .getOrCreate(); - jsc = 
JavaSparkContext.fromSparkContext(spark.sparkContext()); - } + jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + } - @BeforeEach - public void setUp() throws IOException, ISLookUpException { + @BeforeEach + public void setUp() throws IOException, ISLookUpException { - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software"))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"))); + lenient() + 
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset"))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct"))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"))); - } + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"))); + } - @Test - @Order(1) - public void createSimRelsTest() throws Exception { + @Test + @Order(1) + public void createSimRelsTest() throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", - testGraphBasePath, - "-asi", - testActionSetId, - "-la", - "lookupurl", - "-w", - testOutputBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", + testGraphBasePath, + "-asi", + testActionSetId, + "-la", + "lookupurl", + 
"-w", + testOutputBasePath + }); - new SparkCreateSimRels(parser, spark).run(isLookUpService); + new SparkCreateSimRels(parser, spark).run(isLookUpService); - long orgs_simrel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel") - .count(); - long pubs_simrel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel") - .count(); - long sw_simrel = - spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_simrel").count(); + long orgs_simrel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel") + .count(); + long pubs_simrel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel") + .count(); + long sw_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_simrel").count(); - long ds_simrel = - spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel").count(); + long ds_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel").count(); - long orp_simrel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") - .count(); + long orp_simrel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") + .count(); - assertEquals(3432, orgs_simrel); - assertEquals(7152, pubs_simrel); - assertEquals(344, sw_simrel); - assertEquals(458, ds_simrel); - assertEquals(6750, orp_simrel); - } + assertEquals(3432, orgs_simrel); + assertEquals(7152, pubs_simrel); + assertEquals(344, sw_simrel); + assertEquals(458, ds_simrel); + assertEquals(6750, orp_simrel); + } - @Test - @Order(2) - public void createMergeRelsTest() throws Exception { + @Test + @Order(2) + public void createMergeRelsTest() throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - 
SparkCreateMergeRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", - testGraphBasePath, - "-asi", - testActionSetId, - "-la", - "lookupurl", - "-w", - testOutputBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateMergeRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", + testGraphBasePath, + "-asi", + testActionSetId, + "-la", + "lookupurl", + "-w", + testOutputBasePath + }); - new SparkCreateMergeRels(parser, spark).run(isLookUpService); + new SparkCreateMergeRels(parser, spark).run(isLookUpService); - long orgs_mergerel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") - .count(); - long pubs_mergerel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") - .count(); - long sw_mergerel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") - .count(); + long orgs_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") + .count(); + long pubs_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .count(); + long sw_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") + .count(); - long ds_mergerel = - spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel").count(); + long ds_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel").count(); - long orp_mergerel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") - .count(); + long orp_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + 
"/otherresearchproduct_mergerel") + .count(); - assertEquals(1276, orgs_mergerel); - assertEquals(1442, pubs_mergerel); - assertEquals(288, sw_mergerel); - assertEquals(472, ds_mergerel); - assertEquals(718, orp_mergerel); - } + assertEquals(1276, orgs_mergerel); + assertEquals(1442, pubs_mergerel); + assertEquals(288, sw_mergerel); + assertEquals(472, ds_mergerel); + assertEquals(718, orp_mergerel); + } - @Test - @Order(3) - public void createDedupRecordTest() throws Exception { + @Test + @Order(3) + public void createDedupRecordTest() throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateDedupRecord.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", - testGraphBasePath, - "-asi", - testActionSetId, - "-la", - "lookupurl", - "-w", - testOutputBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateDedupRecord.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", + testGraphBasePath, + "-asi", + testActionSetId, + "-la", + "lookupurl", + "-w", + testOutputBasePath + }); - new SparkCreateDedupRecord(parser, spark).run(isLookUpService); + new SparkCreateDedupRecord(parser, spark).run(isLookUpService); - long orgs_deduprecord = - jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord") - .count(); - long pubs_deduprecord = - jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord") - .count(); - long sw_deduprecord = - jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord").count(); - long ds_deduprecord = - jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_deduprecord").count(); - long orp_deduprecord = - jsc.textFile( - testOutputBasePath + "/" + 
testActionSetId + "/otherresearchproduct_deduprecord") - .count(); + long orgs_deduprecord = jsc + .textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord") + .count(); + long pubs_deduprecord = jsc + .textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord") + .count(); + long sw_deduprecord = jsc + .textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord") + .count(); + long ds_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_deduprecord").count(); + long orp_deduprecord = jsc + .textFile( + testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord") + .count(); - assertEquals(82, orgs_deduprecord); - assertEquals(66, pubs_deduprecord); - assertEquals(51, sw_deduprecord); - assertEquals(96, ds_deduprecord); - assertEquals(89, orp_deduprecord); - } + assertEquals(82, orgs_deduprecord); + assertEquals(66, pubs_deduprecord); + assertEquals(51, sw_deduprecord); + assertEquals(96, ds_deduprecord); + assertEquals(89, orp_deduprecord); + } - @Test - @Order(4) - public void updateEntityTest() throws Exception { + @Test + @Order(4) + public void updateEntityTest() throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkUpdateEntity.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", testGraphBasePath, "-w", testOutputBasePath, "-o", testDedupGraphBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkUpdateEntity.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", testGraphBasePath, "-w", testOutputBasePath, "-o", testDedupGraphBasePath + }); - new SparkUpdateEntity(parser, spark).run(isLookUpService); + new SparkUpdateEntity(parser, 
spark).run(isLookUpService); - long organizations = jsc.textFile(testDedupGraphBasePath + "/organization").count(); - long publications = jsc.textFile(testDedupGraphBasePath + "/publication").count(); - long projects = jsc.textFile(testDedupGraphBasePath + "/project").count(); - long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count(); - long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count(); - long dataset = jsc.textFile(testDedupGraphBasePath + "/dataset").count(); - long otherresearchproduct = - jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct").count(); + long organizations = jsc.textFile(testDedupGraphBasePath + "/organization").count(); + long publications = jsc.textFile(testDedupGraphBasePath + "/publication").count(); + long projects = jsc.textFile(testDedupGraphBasePath + "/project").count(); + long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count(); + long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count(); + long dataset = jsc.textFile(testDedupGraphBasePath + "/dataset").count(); + long otherresearchproduct = jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct").count(); - long mergedOrgs = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedOrgs = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - long mergedPubs = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedPubs = spark + .read() + 
.load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - long mergedSw = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedSw = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - long mergedDs = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedDs = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - long mergedOrp = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedOrp = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - assertEquals(897, publications); - assertEquals(835, organizations); - assertEquals(100, projects); - assertEquals(100, datasource); - assertEquals(200, softwares); - assertEquals(388, dataset); - assertEquals(517, otherresearchproduct); + assertEquals(897, publications); + assertEquals(835, 
organizations); + assertEquals(100, projects); + assertEquals(100, datasource); + assertEquals(200, softwares); + assertEquals(388, dataset); + assertEquals(517, otherresearchproduct); - long deletedOrgs = - jsc.textFile(testDedupGraphBasePath + "/organization") - .filter(this::isDeletedByInference) - .count(); + long deletedOrgs = jsc + .textFile(testDedupGraphBasePath + "/organization") + .filter(this::isDeletedByInference) + .count(); - long deletedPubs = - jsc.textFile(testDedupGraphBasePath + "/publication") - .filter(this::isDeletedByInference) - .count(); + long deletedPubs = jsc + .textFile(testDedupGraphBasePath + "/publication") + .filter(this::isDeletedByInference) + .count(); - long deletedSw = - jsc.textFile(testDedupGraphBasePath + "/software") - .filter(this::isDeletedByInference) - .count(); + long deletedSw = jsc + .textFile(testDedupGraphBasePath + "/software") + .filter(this::isDeletedByInference) + .count(); - long deletedDs = - jsc.textFile(testDedupGraphBasePath + "/dataset") - .filter(this::isDeletedByInference) - .count(); + long deletedDs = jsc + .textFile(testDedupGraphBasePath + "/dataset") + .filter(this::isDeletedByInference) + .count(); - long deletedOrp = - jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct") - .filter(this::isDeletedByInference) - .count(); + long deletedOrp = jsc + .textFile(testDedupGraphBasePath + "/otherresearchproduct") + .filter(this::isDeletedByInference) + .count(); - assertEquals(mergedOrgs, deletedOrgs); - assertEquals(mergedPubs, deletedPubs); - assertEquals(mergedSw, deletedSw); - assertEquals(mergedDs, deletedDs); - assertEquals(mergedOrp, deletedOrp); - } + assertEquals(mergedOrgs, deletedOrgs); + assertEquals(mergedPubs, deletedPubs); + assertEquals(mergedSw, deletedSw); + assertEquals(mergedDs, deletedDs); + assertEquals(mergedOrp, deletedOrp); + } - @Test - @Order(5) - public void propagateRelationTest() throws Exception { + @Test + @Order(5) + public void propagateRelationTest() throws 
Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkPropagateRelation.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", testGraphBasePath, "-w", testOutputBasePath, "-o", testDedupGraphBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkPropagateRelation.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", testGraphBasePath, "-w", testOutputBasePath, "-o", testDedupGraphBasePath + }); - new SparkPropagateRelation(parser, spark).run(isLookUpService); + new SparkPropagateRelation(parser, spark).run(isLookUpService); - long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); + long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); - assertEquals(826, relations); + assertEquals(826, relations); - // check deletedbyinference - final Dataset mergeRels = - spark - .read() - .load(DedupUtility.createMergeRelPath(testOutputBasePath, "*", "*")) - .as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = - mergeRels - .where("relClass == 'merges'") - .select(mergeRels.col("target")) - .distinct() - .toJavaRDD() - .mapToPair( - (PairFunction) - r -> new Tuple2(r.getString(0), "d")); + // check deletedbyinference + final Dataset mergeRels = spark + .read() + .load(DedupUtility.createMergeRelPath(testOutputBasePath, "*", "*")) + .as(Encoders.bean(Relation.class)); + final JavaPairRDD mergedIds = mergeRels + .where("relClass == 'merges'") + .select(mergeRels.col("target")) + .distinct() + .toJavaRDD() + .mapToPair( + (PairFunction) r -> new Tuple2(r.getString(0), "d")); - JavaRDD toCheck = - jsc.textFile(testDedupGraphBasePath + "/relation") - .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.source", json), 
json)) - .join(mergedIds) - .map(t -> t._2()._1()) - .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.target", json), json)) - .join(mergedIds) - .map(t -> t._2()._1()); + JavaRDD toCheck = jsc + .textFile(testDedupGraphBasePath + "/relation") + .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.source", json), json)) + .join(mergedIds) + .map(t -> t._2()._1()) + .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.target", json), json)) + .join(mergedIds) + .map(t -> t._2()._1()); - long deletedbyinference = toCheck.filter(this::isDeletedByInference).count(); - long updated = toCheck.count(); + long deletedbyinference = toCheck.filter(this::isDeletedByInference).count(); + long updated = toCheck.count(); - assertEquals(updated, deletedbyinference); - } + assertEquals(updated, deletedbyinference); + } - @AfterAll - public static void finalCleanUp() throws IOException { - FileUtils.deleteDirectory(new File(testOutputBasePath)); - FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); - } + @AfterAll + public static void finalCleanUp() throws IOException { + FileUtils.deleteDirectory(new File(testOutputBasePath)); + FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); + } - public boolean isDeletedByInference(String s) { - return s.contains("\"deletedbyinference\":true"); - } + public boolean isDeletedByInference(String s) { + return s.contains("\"deletedbyinference\":true"); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java index 7c8d937ce3..9518efdb57 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java @@ -1,291 +1,292 @@ + package eu.dnetlib.dhp.oa.dedup.jpath; +import 
org.junit.jupiter.api.Test; + import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; -import org.junit.jupiter.api.Test; public class JsonPathTest { - String json = - "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor 
AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value
\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"
value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnutscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; - DedupConfig conf = - DedupConfig.load( - "{\n" - + " 
\"wf\" : {\n" - + " \"threshold\" : \"0.99\",\n" - + " \"dedupRun\" : \"001\",\n" - + " \"entityType\" : \"organization\",\n" - + " \"subEntityValue\": \"organization\",\n" - + " \"orderField\" : \"legalname\",\n" - + " \"queueMaxSize\" : \"2000\",\n" - + " \"groupMaxSize\" : \"50\",\n" - + " \"slidingWindowSize\" : \"200\",\n" - + " \"idPath\":\"$.id\",\n" - + " \"rootBuilder\" : [ \"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" - + " \"includeChildren\" : \"true\",\n" - + " \"maxIterations\": \"20\"\n" - + " },\n" - + " \"pace\" : {\n" - + " \"clustering\" : [\n" - + " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" - + " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" - + " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" - + " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} }\n" - + " ],\n" - + " \"decisionTree\" : {\n" - + " \"start\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"gridid\",\n" - + " \"comparator\": \"exactMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"false\",\n" - + " \"params\": {}\n" - + " }\n" - + " ],\n" - + " \"threshold\": 1,\n" - + " \"aggregation\": \"AVG\",\n" - + " \"positive\": \"MATCH\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"layer2\",\n" - + " \"ignoreUndefined\": \"false\"\n" - + " },\n" - + " \"layer2\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"websiteurl\",\n" - + " \"comparator\": \"domainExactMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"false\",\n" - + " \"params\": {}\n" - + " },\n" - + " {\n" - + " \"field\": \"country\",\n" - + " \"comparator\": \"exactMatch\",\n" - + " \"weight\": 1,\n" - + " 
\"countIfUndefined\": \"true\",\n" - + " \"params\": {}\n" - + " },\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"numbersMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {}\n" - + " },\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"romansMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {}\n" - + " }\n" - + " ],\n" - + " \"threshold\": 1,\n" - + " \"aggregation\": \"AND\",\n" - + " \"positive\": \"layer3\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"layer3\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " },\n" - + " \"layer3\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"cityMatch\",\n" - + " \"weight\": 1.0,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {\n" - + " \"windowSize\": \"4\"\n" - + " }\n" - + " }\n" - + " ],\n" - + " \"threshold\": 0.1,\n" - + " \"aggregation\": \"AVG\",\n" - + " \"positive\": \"layer4\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"NO_MATCH\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " },\n" - + " \"layer4\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"keywordMatch\",\n" - + " \"weight\": 1.0,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {\n" - + " \"windowSize\": \"4\"\n" - + " }\n" - + " }\n" - + " ],\n" - + " \"threshold\": 0.7,\n" - + " \"aggregation\": \"AVG\",\n" - + " \"positive\": \"layer5\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"layer5\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " },\n" - + " \"layer5\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"jaroWinklerNormalizedName\",\n" - + " \"weight\": 0.9,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {\n" - + " \"windowSize\": \"4\"\n" - + " }\n" - + " },\n" 
- + " {\n" - + " \"field\": \"legalshortname\",\n" - + " \"comparator\": \"jaroWinklerNormalizedName\",\n" - + " \"weight\": 0.1,\n" - + " \"countIfUndefined\": \"false\",\n" - + " \"params\": {\n" - + " \"windowSize\": 4\n" - + " }\n" - + " }\n" - + " ],\n" - + " \"threshold\": 0.9,\n" - + " \"aggregation\": \"W_MEAN\",\n" - + " \"positive\": \"MATCH\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"NO_MATCH\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " }\n" - + " },\n" - + " \"model\" : [\n" - + " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" - + " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : \"$.legalshortname.value\"},\n" - + " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" - + " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" - + " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" - + " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" - + " ],\n" - + " \"blacklists\" : {\n" - + " \"legalname\" : []\n" - + " },\n" - + " \"synonyms\": {\n" - + " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", \"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" - + " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" - + " \"key::3\": 
[\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" - + " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" - + " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" - + " \"key::6\": [\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" - + " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" - + " \"key::8\": [\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" - + " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" - + " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" - + " \"key::11\": [\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" - + " \"key::12\": [\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" - + " \"key::13\": 
[\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" - + " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" - + " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" - + " \"key::16\": [\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" - + " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" - + " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" - + " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" - + " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" - + " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" - + " \"key::22\": [\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" - + " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" - + " \"key::24\": 
[\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" - + " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" - + " \"key::26\": [\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" - + " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" - + " \"key::28\": [\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" - + " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" - + " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" - + " \"key::31\": [\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" - + " \"key::32\": [\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" - + " \"key::33\": 
[\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" - + " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" - + " \"key::35\": [\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" - + " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" - + " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" - + " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" - + " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" - + " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" - + " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" - + " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" - + " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" - + " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" - + " \"key::45\": [\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" - + " \"key::46\": 
[\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" - + " \"key::47\": [\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" - + " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" - + " \"key::49\": [\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" - + " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" - + " \"key::51\": [\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" - + " \"key::52\": [\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" - + " \"key::53\": 
[\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" - + " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" - + " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" - + " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" - + " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" - + " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" - + " \"key::59\": [\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" - + " \"key::60\": 
[\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" - + " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" - + " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" - + " \"key::63\": [\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" - + " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" - + " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" - + " \"key::66\": [\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" - + " \"key::67\": 
[\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" - + " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" - + " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" - + " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" - + " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" - + " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" - + " \"key::73\": [\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" - + " \"key::74\": [\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" - + " \"key::75\": 
[\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" - + " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" - + " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" - + " \"key::78\": [\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" - + " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" - + " \"key::80\": [\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" - + " \"key::81\": [\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" - + " \"key::82\": 
[\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" - + " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" - + " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" - + " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" - + " \"key::86\": [\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" - + " \"key::87\": [\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" - + " \"key::88\": [\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" - + " \"key::89\": 
[\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" - + " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" - + " \"key::91\": [\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" - + " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" - + " \"key::93\": [\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" - + " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" - + " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" - + " \"key::96\": [\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" - + " \"key::97\": 
[\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" - + " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" - + " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" - + " \"key::100\": [\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" - + " \"key::101\": [\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" - + " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" - + " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" - + " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" - + " \"key::105\" : [\"state\", \"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" - + " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" - + " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" - + " 
\"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" - + " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", \"catolico\"]\n" - + " }\n" - + " }\n" - + "}"); + String json = "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor 
AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value
\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"
value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnutscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; + DedupConfig conf = DedupConfig + .load( + "{\n" + + " 
\"wf\" : {\n" + + " \"threshold\" : \"0.99\",\n" + + " \"dedupRun\" : \"001\",\n" + + " \"entityType\" : \"organization\",\n" + + " \"subEntityValue\": \"organization\",\n" + + " \"orderField\" : \"legalname\",\n" + + " \"queueMaxSize\" : \"2000\",\n" + + " \"groupMaxSize\" : \"50\",\n" + + " \"slidingWindowSize\" : \"200\",\n" + + " \"idPath\":\"$.id\",\n" + + " \"rootBuilder\" : [ \"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" + + " \"includeChildren\" : \"true\",\n" + + " \"maxIterations\": \"20\"\n" + + " },\n" + + " \"pace\" : {\n" + + " \"clustering\" : [\n" + + " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" + + " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" + + " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" + + " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} }\n" + + " ],\n" + + " \"decisionTree\" : {\n" + + " \"start\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"gridid\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer2\",\n" + + " \"ignoreUndefined\": \"false\"\n" + + " },\n" + + " \"layer2\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"websiteurl\",\n" + + " \"comparator\": \"domainExactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"country\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " 
\"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"numbersMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"romansMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AND\",\n" + + " \"positive\": \"layer3\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer3\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer3\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"cityMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer4\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer4\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"keywordMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.7,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer5\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer5\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer5\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.9,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " },\n" 
+ + " {\n" + + " \"field\": \"legalshortname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {\n" + + " \"windowSize\": 4\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.9,\n" + + " \"aggregation\": \"W_MEAN\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " }\n" + + " },\n" + + " \"model\" : [\n" + + " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" + + " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : \"$.legalshortname.value\"},\n" + + " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" + + " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" + + " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" + + " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" + + " ],\n" + + " \"blacklists\" : {\n" + + " \"legalname\" : []\n" + + " },\n" + + " \"synonyms\": {\n" + + " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", \"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" + + " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" + + " \"key::3\": 
[\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" + + " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" + + " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" + + " \"key::6\": [\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" + + " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" + + " \"key::8\": [\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" + + " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" + + " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" + + " \"key::11\": [\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" + + " \"key::12\": [\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" + + " \"key::13\": 
[\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" + + " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" + + " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" + + " \"key::16\": [\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" + + " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" + + " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" + + " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" + + " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" + + " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" + + " \"key::22\": [\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" + + " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" + + " \"key::24\": 
[\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" + + " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" + + " \"key::26\": [\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" + + " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" + + " \"key::28\": [\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" + + " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" + + " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" + + " \"key::31\": [\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" + + " \"key::32\": [\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" + + " \"key::33\": 
[\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" + + " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" + + " \"key::35\": [\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" + + " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" + + " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" + + " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" + + " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" + + " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" + + " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" + + " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" + + " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" + + " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" + + " \"key::45\": [\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" + + " \"key::46\": 
[\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" + + " \"key::47\": [\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" + + " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" + + " \"key::49\": [\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" + + " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" + + " \"key::51\": [\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" + + " \"key::52\": [\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" + + " \"key::53\": 
[\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" + + " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" + + " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" + + " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" + + " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" + + " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" + + " \"key::59\": [\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" + + " \"key::60\": 
[\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" + + " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" + + " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" + + " \"key::63\": [\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" + + " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" + + " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" + + " \"key::66\": [\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" + + " \"key::67\": 
[\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" + + " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" + + " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" + + " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" + + " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" + + " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" + + " \"key::73\": [\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" + + " \"key::74\": [\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" + + " \"key::75\": 
[\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" + + " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" + + " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" + + " \"key::78\": [\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" + + " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" + + " \"key::80\": [\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" + + " \"key::81\": [\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" + + " \"key::82\": 
[\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" + + " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" + + " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" + + " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" + + " \"key::86\": [\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" + + " \"key::87\": [\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" + + " \"key::88\": [\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" + + " \"key::89\": 
[\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" + + " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" + + " \"key::91\": [\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" + + " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" + + " \"key::93\": [\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" + + " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" + + " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" + + " \"key::96\": [\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" + + " \"key::97\": 
[\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" + + " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" + + " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" + + " \"key::100\": [\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" + + " \"key::101\": [\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" + + " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" + + " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" + + " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" + + " \"key::105\" : [\"state\", \"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" + + " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" + + " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" + + " 
\"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" + + " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", \"catolico\"]\n" + + " }\n" + + " }\n" + + "}"); - @Test - public void testJPath() throws Exception { + @Test + public void testJPath() throws Exception { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(conf, json); + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(conf, json); - System.out.println("d = " + d); - } + System.out.println("d = " + d); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java index 8a6cdf7dcc..db55434d87 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java @@ -1,113 +1,121 @@ + package eu.dnetlib.dedup; import static java.util.Collections.reverseOrder; import static java.util.Map.Entry.comparingByValue; import static java.util.stream.Collectors.toMap; + import static org.apache.commons.lang.StringUtils.endsWith; import static org.apache.commons.lang.StringUtils.substringBefore; -import eu.dnetlib.dhp.schema.oaf.Field; import java.time.Year; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.lang.StringUtils; +import eu.dnetlib.dhp.schema.oaf.Field; + public class DatePicker { - private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; - private static final String DATE_DEFAULT_SUFFIX = "01-01"; - private static final int YEAR_LB = 1300; - private static final int YEAR_UB = Year.now().getValue() + 5; + private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; + private static final String DATE_DEFAULT_SUFFIX = "01-01"; + private static final int YEAR_LB = 1300; + private static final int YEAR_UB = Year.now().getValue() + 5; - public static Field 
pick(final Collection dateofacceptance) { + public static Field pick(final Collection dateofacceptance) { - final Map frequencies = - dateofacceptance - .parallelStream() - .filter(StringUtils::isNotBlank) - .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); + final Map frequencies = dateofacceptance + .parallelStream() + .filter(StringUtils::isNotBlank) + .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); - if (frequencies.isEmpty()) { - return new Field<>(); - } + if (frequencies.isEmpty()) { + return new Field<>(); + } - final Field date = new Field<>(); - date.setValue(frequencies.keySet().iterator().next()); + final Field date = new Field<>(); + date.setValue(frequencies.keySet().iterator().next()); - // let's sort this map by values first, filtering out invalid dates - final Map sorted = - frequencies.entrySet().stream() - .filter(d -> StringUtils.isNotBlank(d.getKey())) - .filter(d -> d.getKey().matches(DATE_PATTERN)) - .filter(d -> inRange(d.getKey())) - .sorted(reverseOrder(comparingByValue())) - .collect( - toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); + // let's sort this map by values first, filtering out invalid dates + final Map sorted = frequencies + .entrySet() + .stream() + .filter(d -> StringUtils.isNotBlank(d.getKey())) + .filter(d -> d.getKey().matches(DATE_PATTERN)) + .filter(d -> inRange(d.getKey())) + .sorted(reverseOrder(comparingByValue())) + .collect( + toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); - // shortcut - if (sorted.size() == 0) { - return date; - } + // shortcut + if (sorted.size() == 0) { + return date; + } - // voting method (1/3 + 1) wins - if (sorted.size() >= 3) { - final int acceptThreshold = (sorted.size() / 3) + 1; - final List accepted = - sorted.entrySet().stream() - .filter(e -> e.getValue() >= acceptThreshold) - .map(e -> e.getKey()) - .collect(Collectors.toList()); + // voting method (1/3 + 1) wins + if 
(sorted.size() >= 3) { + final int acceptThreshold = (sorted.size() / 3) + 1; + final List accepted = sorted + .entrySet() + .stream() + .filter(e -> e.getValue() >= acceptThreshold) + .map(e -> e.getKey()) + .collect(Collectors.toList()); - // cannot find strong majority - if (accepted.isEmpty()) { - final int max = sorted.values().iterator().next(); - Optional first = - sorted.entrySet().stream() - .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) - .map(Map.Entry::getKey) - .findFirst(); - if (first.isPresent()) { - date.setValue(first.get()); - return date; - } + // cannot find strong majority + if (accepted.isEmpty()) { + final int max = sorted.values().iterator().next(); + Optional first = sorted + .entrySet() + .stream() + .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) + .map(Map.Entry::getKey) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } - date.setValue(sorted.keySet().iterator().next()); - return date; - } + date.setValue(sorted.keySet().iterator().next()); + return date; + } - if (accepted.size() == 1) { - date.setValue(accepted.get(0)); - return date; - } else { - final Optional first = - accepted.stream().filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)).findFirst(); - if (first.isPresent()) { - date.setValue(first.get()); - return date; - } + if (accepted.size() == 1) { + date.setValue(accepted.get(0)); + return date; + } else { + final Optional first = accepted + .stream() + .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } - return date; - } + return date; + } - // 1st non YYYY-01-01 is returned - } else { - if (sorted.size() == 2) { - for (Map.Entry e : sorted.entrySet()) { - if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { - date.setValue(e.getKey()); - return date; - } - } - } + // 1st non YYYY-01-01 is returned + } else { + if (sorted.size() == 2) { + 
for (Map.Entry e : sorted.entrySet()) { + if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { + date.setValue(e.getKey()); + return date; + } + } + } - // none of the dates seems good enough, return the 1st one - date.setValue(sorted.keySet().iterator().next()); - return date; - } - } + // none of the dates seems good enough, return the 1st one + date.setValue(sorted.keySet().iterator().next()); + return date; + } + } - private static boolean inRange(final String date) { - final int year = Integer.parseInt(substringBefore(date, "-")); - return year >= YEAR_LB && year <= YEAR_UB; - } + private static boolean inRange(final String date) { + final int year = Integer.parseInt(substringBefore(date, "-")); + return year >= YEAR_LB && year <= YEAR_UB; + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java index 782aa174fe..d03cc25895 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java @@ -1,304 +1,318 @@ + package eu.dnetlib.dedup; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.util.Collection; + import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; + +import 
eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; public class DedupRecordFactory { - public static JavaRDD createDedupRecord( - final JavaSparkContext sc, - final SparkSession spark, - final String mergeRelsInputPath, - final String entitiesInputPath, - final OafEntityType entityType, - final DedupConfig dedupConf) { - long ts = System.currentTimeMillis(); - // - final JavaPairRDD inputJsonEntities = - sc.textFile(entitiesInputPath) - .mapToPair( - (PairFunction) - it -> - new Tuple2( - MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)); + public static JavaRDD createDedupRecord( + final JavaSparkContext sc, + final SparkSession spark, + final String mergeRelsInputPath, + final String entitiesInputPath, + final OafEntityType entityType, + final DedupConfig dedupConf) { + long ts = System.currentTimeMillis(); + // + final JavaPairRDD inputJsonEntities = sc + .textFile(entitiesInputPath) + .mapToPair( + (PairFunction) it -> new Tuple2( + MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)); - // : source is the dedup_id, target is the id of the mergedIn - JavaPairRDD mergeRels = - spark - .read() - .load(mergeRelsInputPath) - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .mapToPair( - (PairFunction) - r -> new Tuple2(r.getTarget(), r.getSource())); + // : source is the dedup_id, target is the id of the mergedIn + JavaPairRDD mergeRels = spark + .read() + .load(mergeRelsInputPath) + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .mapToPair( + (PairFunction) r -> new Tuple2(r.getTarget(), r.getSource())); - // - final JavaPairRDD joinResult = - mergeRels - .join(inputJsonEntities) - .mapToPair( - (PairFunction>, String, String>) Tuple2::_2); + // + final JavaPairRDD joinResult = mergeRels + .join(inputJsonEntities) + .mapToPair( + (PairFunction>, String, String>) 
Tuple2::_2); - JavaPairRDD> sortedJoinResult = joinResult.groupByKey(); + JavaPairRDD> sortedJoinResult = joinResult.groupByKey(); - switch (entityType) { - case publication: - return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts)); - case dataset: - return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts)); - case project: - return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts)); - case software: - return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts)); - case datasource: - return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts)); - case organization: - return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts)); - case otherresearchproduct: - return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts)); - default: - return null; - } - } + switch (entityType) { + case publication: + return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts)); + case dataset: + return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts)); + case project: + return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts)); + case software: + return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts)); + case datasource: + return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts)); + case organization: + return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts)); + case otherresearchproduct: + return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts)); + default: + return null; + } + } - private static Publication publicationMerger(Tuple2> e, final long ts) { + private static Publication publicationMerger(Tuple2> e, final long ts) { - Publication p = new Publication(); // the result of the merge, to be returned at the end + Publication p = new Publication(); // the result of the merge, to be returned at the end - 
p.setId(e._1()); + p.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); + final Collection dateofacceptance = Lists.newArrayList(); - if (e._2() != null) - e._2() - .forEach( - pub -> { - try { - Publication publication = mapper.readValue(pub, Publication.class); + if (e._2() != null) + e + ._2() + .forEach( + pub -> { + try { + Publication publication = mapper.readValue(pub, Publication.class); - p.mergeFrom(publication); - p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor())); - // add to the list if they are not null - if (publication.getDateofacceptance() != null) - dateofacceptance.add(publication.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - p.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (p.getDataInfo() == null) p.setDataInfo(new DataInfo()); - p.getDataInfo().setTrust("0.9"); - p.setLastupdatetimestamp(ts); - return p; - } + p.mergeFrom(publication); + p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor())); + // add to the list if they are not null + if (publication.getDateofacceptance() != null) + dateofacceptance.add(publication.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + p.setDateofacceptance(DatePicker.pick(dateofacceptance)); + if (p.getDataInfo() == null) + p.setDataInfo(new DataInfo()); + p.getDataInfo().setTrust("0.9"); + p.setLastupdatetimestamp(ts); + return p; + } - private static Dataset datasetMerger(Tuple2> e, final long ts) { + private static Dataset datasetMerger(Tuple2> e, final long ts) { - Dataset d = new Dataset(); // the result of the merge, to be returned at the end + 
Dataset d = new Dataset(); // the result of the merge, to be returned at the end - d.setId(e._1()); + d.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); + final Collection dateofacceptance = Lists.newArrayList(); - if (e._2() != null) - e._2() - .forEach( - dat -> { - try { - Dataset dataset = mapper.readValue(dat, Dataset.class); + if (e._2() != null) + e + ._2() + .forEach( + dat -> { + try { + Dataset dataset = mapper.readValue(dat, Dataset.class); - d.mergeFrom(dataset); - d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor())); - // add to the list if they are not null - if (dataset.getDateofacceptance() != null) - dateofacceptance.add(dataset.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - d.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (d.getDataInfo() == null) d.setDataInfo(new DataInfo()); - d.getDataInfo().setTrust("0.9"); - d.setLastupdatetimestamp(ts); - return d; - } + d.mergeFrom(dataset); + d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor())); + // add to the list if they are not null + if (dataset.getDateofacceptance() != null) + dateofacceptance.add(dataset.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + d.setDateofacceptance(DatePicker.pick(dateofacceptance)); + if (d.getDataInfo() == null) + d.setDataInfo(new DataInfo()); + d.getDataInfo().setTrust("0.9"); + d.setLastupdatetimestamp(ts); + return d; + } - private static Project projectMerger(Tuple2> e, final long ts) { + private static Project projectMerger(Tuple2> e, final long ts) { - Project p = new Project(); // the result of the merge, to 
be returned at the end + Project p = new Project(); // the result of the merge, to be returned at the end - p.setId(e._1()); + p.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - if (e._2() != null) - e._2() - .forEach( - proj -> { - try { - Project project = mapper.readValue(proj, Project.class); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + if (e._2() != null) + e + ._2() + .forEach( + proj -> { + try { + Project project = mapper.readValue(proj, Project.class); - p.mergeFrom(project); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (p.getDataInfo() == null) p.setDataInfo(new DataInfo()); - p.getDataInfo().setTrust("0.9"); - p.setLastupdatetimestamp(ts); - return p; - } + p.mergeFrom(project); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + if (p.getDataInfo() == null) + p.setDataInfo(new DataInfo()); + p.getDataInfo().setTrust("0.9"); + p.setLastupdatetimestamp(ts); + return p; + } - private static Software softwareMerger(Tuple2> e, final long ts) { + private static Software softwareMerger(Tuple2> e, final long ts) { - Software s = new Software(); // the result of the merge, to be returned at the end + Software s = new Software(); // the result of the merge, to be returned at the end - s.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); - if (e._2() != null) - e._2() - .forEach( - soft -> { - try { - Software software = mapper.readValue(soft, Software.class); + s.setId(e._1()); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final Collection dateofacceptance = Lists.newArrayList(); + if (e._2() != 
null) + e + ._2() + .forEach( + soft -> { + try { + Software software = mapper.readValue(soft, Software.class); - s.mergeFrom(software); - s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor())); - // add to the list if they are not null - if (software.getDateofacceptance() != null) - dateofacceptance.add(software.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - s.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (s.getDataInfo() == null) s.setDataInfo(new DataInfo()); - s.getDataInfo().setTrust("0.9"); - s.setLastupdatetimestamp(ts); - return s; - } + s.mergeFrom(software); + s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor())); + // add to the list if they are not null + if (software.getDateofacceptance() != null) + dateofacceptance.add(software.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + s.setDateofacceptance(DatePicker.pick(dateofacceptance)); + if (s.getDataInfo() == null) + s.setDataInfo(new DataInfo()); + s.getDataInfo().setTrust("0.9"); + s.setLastupdatetimestamp(ts); + return s; + } - private static Datasource datasourceMerger(Tuple2> e, final long ts) { - Datasource d = new Datasource(); // the result of the merge, to be returned at the end - d.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - if (e._2() != null) - e._2() - .forEach( - dat -> { - try { - Datasource datasource = mapper.readValue(dat, Datasource.class); + private static Datasource datasourceMerger(Tuple2> e, final long ts) { + Datasource d = new Datasource(); // the result of the merge, to be returned at the end + d.setId(e._1()); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + if (e._2() != null) + e + ._2() + .forEach( + dat -> { + try { + 
Datasource datasource = mapper.readValue(dat, Datasource.class); - d.mergeFrom(datasource); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (d.getDataInfo() == null) d.setDataInfo(new DataInfo()); - d.getDataInfo().setTrust("0.9"); - d.setLastupdatetimestamp(ts); - return d; - } + d.mergeFrom(datasource); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + if (d.getDataInfo() == null) + d.setDataInfo(new DataInfo()); + d.getDataInfo().setTrust("0.9"); + d.setLastupdatetimestamp(ts); + return d; + } - private static Organization organizationMerger( - Tuple2> e, final long ts) { + private static Organization organizationMerger( + Tuple2> e, final long ts) { - Organization o = new Organization(); // the result of the merge, to be returned at the end + Organization o = new Organization(); // the result of the merge, to be returned at the end - o.setId(e._1()); + o.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - StringBuilder trust = new StringBuilder("0.0"); + StringBuilder trust = new StringBuilder("0.0"); - if (e._2() != null) - e._2() - .forEach( - pub -> { - try { - Organization organization = mapper.readValue(pub, Organization.class); + if (e._2() != null) + e + ._2() + .forEach( + pub -> { + try { + Organization organization = mapper.readValue(pub, Organization.class); - final String currentTrust = organization.getDataInfo().getTrust(); - if (!"1.0".equals(currentTrust)) { - trust.setLength(0); - trust.append(currentTrust); - } - o.mergeFrom(organization); + final String currentTrust = organization.getDataInfo().getTrust(); + if (!"1.0".equals(currentTrust)) { + trust.setLength(0); + trust.append(currentTrust); + } + o.mergeFrom(organization); - } catch (Exception exc) { - throw new 
RuntimeException(exc); - } - }); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); - if (o.getDataInfo() == null) { - o.setDataInfo(new DataInfo()); - } - if (o.getDataInfo() == null) o.setDataInfo(new DataInfo()); - o.getDataInfo().setTrust("0.9"); - o.setLastupdatetimestamp(ts); + if (o.getDataInfo() == null) { + o.setDataInfo(new DataInfo()); + } + if (o.getDataInfo() == null) + o.setDataInfo(new DataInfo()); + o.getDataInfo().setTrust("0.9"); + o.setLastupdatetimestamp(ts); - return o; - } + return o; + } - private static OtherResearchProduct otherresearchproductMerger( - Tuple2> e, final long ts) { + private static OtherResearchProduct otherresearchproductMerger( + Tuple2> e, final long ts) { - OtherResearchProduct o = new OtherResearchProduct(); // the result of the merge, to be - // returned at the end + OtherResearchProduct o = new OtherResearchProduct(); // the result of the merge, to be + // returned at the end - o.setId(e._1()); + o.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); + final Collection dateofacceptance = Lists.newArrayList(); - if (e._2() != null) - e._2() - .forEach( - orp -> { - try { - OtherResearchProduct otherResearchProduct = - mapper.readValue(orp, OtherResearchProduct.class); + if (e._2() != null) + e + ._2() + .forEach( + orp -> { + try { + OtherResearchProduct otherResearchProduct = mapper + .readValue(orp, OtherResearchProduct.class); - o.mergeFrom(otherResearchProduct); - o.setAuthor( - DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor())); - // add to the list if they are not null - if (otherResearchProduct.getDateofacceptance() != null) - 
dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (o.getDataInfo() == null) o.setDataInfo(new DataInfo()); - o.setDateofacceptance(DatePicker.pick(dateofacceptance)); - o.getDataInfo().setTrust("0.9"); - o.setLastupdatetimestamp(ts); - return o; - } + o.mergeFrom(otherResearchProduct); + o + .setAuthor( + DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor())); + // add to the list if they are not null + if (otherResearchProduct.getDateofacceptance() != null) + dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + if (o.getDataInfo() == null) + o.setDataInfo(new DataInfo()); + o.setDateofacceptance(DatePicker.pick(dateofacceptance)); + o.getDataInfo().setTrust("0.9"); + o.setLastupdatetimestamp(ts); + return o; + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java index 60f0a50f74..70a2e35917 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java @@ -1,13 +1,6 @@ + package eu.dnetlib.dedup; -import com.google.common.collect.Sets; -import com.wcohen.ss.JaroWinkler; -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.Person; import java.io.IOException; import java.io.StringWriter; import java.nio.charset.StandardCharsets; @@ -15,6 +8,7 @@ import java.security.MessageDigest; import java.text.Normalizer; import java.util.*; import 
java.util.stream.Collectors; + import org.apache.commons.codec.binary.Hex; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -26,205 +20,220 @@ import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; + +import com.google.common.collect.Sets; +import com.wcohen.ss.JaroWinkler; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.Person; import scala.Tuple2; public class DedupUtility { - private static final Double THRESHOLD = 0.95; + private static final Double THRESHOLD = 0.95; - public static Map constructAccumulator( - final DedupConfig dedupConf, final SparkContext context) { + public static Map constructAccumulator( + final DedupConfig dedupConf, final SparkContext context) { - Map accumulators = new HashMap<>(); + Map accumulators = new HashMap<>(); - String acc1 = - String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); - accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = - String.format( - "%s::%s", - dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); - accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = - String.format( - "%s::%s", - dedupConf.getWf().getEntityType(), - String.format( - "Skipped records for count(%s) >= %s", - dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); - accumulators.put(acc3, context.longAccumulator(acc3)); - String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); - accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = - String.format("%s::%s", dedupConf.getWf().getEntityType(), 
"dedupSimilarity (x2)"); - accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = - String.format( - "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); - accumulators.put(acc6, context.longAccumulator(acc6)); + String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); + accumulators.put(acc1, context.longAccumulator(acc1)); + String acc2 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); + accumulators.put(acc2, context.longAccumulator(acc2)); + String acc3 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), + String + .format( + "Skipped records for count(%s) >= %s", + dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); + accumulators.put(acc3, context.longAccumulator(acc3)); + String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); + accumulators.put(acc4, context.longAccumulator(acc4)); + String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); + accumulators.put(acc5, context.longAccumulator(acc5)); + String acc6 = String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); + accumulators.put(acc6, context.longAccumulator(acc6)); - return accumulators; - } + return accumulators; + } - public static JavaRDD loadDataFromHDFS(String path, JavaSparkContext context) { - return context.textFile(path); - } + public static JavaRDD loadDataFromHDFS(String path, JavaSparkContext context) { + return context.textFile(path); + } - public static void deleteIfExists(String path) throws IOException { - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - if (fileSystem.exists(new Path(path))) { - fileSystem.delete(new Path(path), true); - } - } + public static void deleteIfExists(String path) throws IOException { + 
Configuration conf = new Configuration(); + FileSystem fileSystem = FileSystem.get(conf); + if (fileSystem.exists(new Path(path))) { + fileSystem.delete(new Path(path), true); + } + } - public static DedupConfig loadConfigFromHDFS(String path) throws IOException { + public static DedupConfig loadConfigFromHDFS(String path) throws IOException { - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path))); + Configuration conf = new Configuration(); + FileSystem fileSystem = FileSystem.get(conf); + FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path))); - return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); - } + return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); + } - static String readFromClasspath(final String filename, final Class clazz) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(clazz.getResourceAsStream(filename), sw); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } + static String readFromClasspath(final String filename, final Class clazz) { + final StringWriter sw = new StringWriter(); + try { + IOUtils.copy(clazz.getResourceAsStream(filename), sw); + return sw.toString(); + } catch (final IOException e) { + throw new RuntimeException("cannot load resource from classpath: " + filename); + } + } - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { - return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); - } + static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { + return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); + } - public static String md5(final String s) { - try { - final MessageDigest md = MessageDigest.getInstance("MD5"); 
- md.update(s.getBytes("UTF-8")); - return new String(Hex.encodeHex(md.digest())); - } catch (final Exception e) { - System.err.println("Error creating id"); - return null; - } - } + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes("UTF-8")); + return new String(Hex.encodeHex(md.digest())); + } catch (final Exception e) { + System.err.println("Error creating id"); + return null; + } + } - public static List mergeAuthor(final List a, final List b) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base, enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); + public static List mergeAuthor(final List a, final List b) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base, enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } - enrichPidFromList(base, enrich); - return base; - } + if (pa == pb) { + base = sa > sb ? a : b; + enrich = sa > sb ? b : a; + } else { + base = pa > pb ? a : b; + enrich = pa > pb ? 
b : a; + } + enrichPidFromList(base, enrich); + return base; + } - private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) return; - final Map basePidAuthorMap = - base.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + private static void enrichPidFromList(List base, List enrich) { + if (base == null || enrich == null) + return; + final Map basePidAuthorMap = base + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - final List> pidToEnrich = - enrich.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> - a.getPid().stream() - .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); - pidToEnrich.forEach( - a -> { - Optional> simAuhtor = - base.stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { - Author r = simAuhtor.get()._2(); - r.getPid().add(a._1()); - } - }); - } + pidToEnrich + .forEach( + a -> { + Optional> simAuhtor = base + .stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { + Author r = simAuhtor.get()._2(); + r.getPid().add(a._1()); + } + }); 
+ } - public static String createEntityPath(final String basePath, final String entityType) { - return String.format("%s/%s", basePath, entityType); - } + public static String createEntityPath(final String basePath, final String entityType) { + return String.format("%s/%s", basePath, entityType); + } - public static String createSimRelPath(final String basePath, final String entityType) { - return String.format("%s/%s/simRel", basePath, entityType); - } + public static String createSimRelPath(final String basePath, final String entityType) { + return String.format("%s/%s/simRel", basePath, entityType); + } - public static String createMergeRelPath(final String basePath, final String entityType) { - return String.format("%s/%s/mergeRel", basePath, entityType); - } + public static String createMergeRelPath(final String basePath, final String entityType) { + return String.format("%s/%s/mergeRel", basePath, entityType); + } - private static Double sim(Author a, Author b) { + private static Double sim(Author a, Author b) { - final Person pa = parse(a); - final Person pb = parse(b); + final Person pa = parse(a); + final Person pb = parse(b); - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler() - .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } - } + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler() + .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); + } else { + return new JaroWinkler() + .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); + } + } - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - 
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + return new Person(author.getFullname(), false); + } + } - private static int countAuthorsPids(List authors) { - if (authors == null) return 0; + private static int countAuthorsPids(List authors) { + if (authors == null) + return 0; - return (int) authors.stream().filter(DedupUtility::hasPid).count(); - } + return (int) authors.stream().filter(DedupUtility::hasPid).count(); + } - private static int authorsSize(List authors) { - if (authors == null) return 0; - return authors.size(); - } + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) return false; - return a.getPid().stream().anyMatch(p -> p != null && 
StringUtils.isNotBlank(p.getValue())); - } + private static boolean hasPid(Author a) { + if (a == null || a.getPid() == null || a.getPid().size() == 0) + return false; + return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java index 681a4168aa..e7d49be988 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java @@ -1,11 +1,9 @@ + package eu.dnetlib.dedup; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.BlockProcessor; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.spark.api.java.JavaPairRDD; @@ -15,170 +13,170 @@ import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFlatMapFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.util.LongAccumulator; + +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.BlockProcessor; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Serializable; import scala.Tuple2; public class Deduper implements Serializable { - private static final Log log = LogFactory.getLog(Deduper.class); + private static final Log log = LogFactory.getLog(Deduper.class); - /** - * @return the list of relations generated by the deduplication - * @param: the spark context - * @param: list of JSON entities to be deduped - * @param: the dedup configuration - */ - public static JavaPairRDD dedup( - JavaSparkContext context, JavaRDD entities, 
DedupConfig config) { + /** + * @return the list of relations generated by the deduplication + * @param: the spark context + * @param: list of JSON entities to be deduped + * @param: the dedup configuration + */ + public static JavaPairRDD dedup( + JavaSparkContext context, JavaRDD entities, DedupConfig config) { - Map accumulators = - DedupUtility.constructAccumulator(config, context.sc()); + Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - // create vertexes of the graph: - JavaPairRDD mapDocs = mapToVertexes(context, entities, config); + // create vertexes of the graph: + JavaPairRDD mapDocs = mapToVertexes(context, entities, config); - // create blocks for deduplication - JavaPairRDD> blocks = createBlocks(context, mapDocs, config); + // create blocks for deduplication + JavaPairRDD> blocks = createBlocks(context, mapDocs, config); - // create relations by comparing only elements in the same group - return computeRelations(context, blocks, config); + // create relations by comparing only elements in the same group + return computeRelations(context, blocks, config); - // final RDD> edgeRdd = relationRDD.map(it -> new - // Edge<>(it._1().hashCode(), - // it._2().hashCode(), "equalTo")).rdd(); - // - // RDD> vertexes = - // mapDocs.mapToPair((PairFunction, Object, MapDocument>) t -> - // new - // Tuple2((long) t._1().hashCode(), t._2())).rdd(); - // accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value())); - // - // return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); - } + // final RDD> edgeRdd = relationRDD.map(it -> new + // Edge<>(it._1().hashCode(), + // it._2().hashCode(), "equalTo")).rdd(); + // + // RDD> vertexes = + // mapDocs.mapToPair((PairFunction, Object, MapDocument>) t -> + // new + // Tuple2((long) t._1().hashCode(), t._2())).rdd(); + // accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value())); + // + // return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); + 
} - /** - * @return the list of relations generated by the deduplication - * @param: the spark context - * @param: list of blocks - * @param: the dedup configuration - */ - public static JavaPairRDD computeRelations( - JavaSparkContext context, - JavaPairRDD> blocks, - DedupConfig config) { + /** + * @return the list of relations generated by the deduplication + * @param: the spark context + * @param: list of blocks + * @param: the dedup configuration + */ + public static JavaPairRDD computeRelations( + JavaSparkContext context, + JavaPairRDD> blocks, + DedupConfig config) { - Map accumulators = - DedupUtility.constructAccumulator(config, context.sc()); + Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - return blocks - .flatMapToPair( - (PairFlatMapFunction>, String, String>) - it -> { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config).process(it._1(), it._2(), reporter); - return reporter.getRelations().iterator(); - }) - .mapToPair( - (PairFunction, String, Tuple2>) - item -> new Tuple2>(item._1() + item._2(), item)) - .reduceByKey((a, b) -> a) - .mapToPair( - (PairFunction>, String, String>) Tuple2::_2); - } + return blocks + .flatMapToPair( + (PairFlatMapFunction>, String, String>) it -> { + final SparkReporter reporter = new SparkReporter(accumulators); + new BlockProcessor(config).process(it._1(), it._2(), reporter); + return reporter.getRelations().iterator(); + }) + .mapToPair( + (PairFunction, String, Tuple2>) item -> new Tuple2>( + item._1() + item._2(), item)) + .reduceByKey((a, b) -> a) + .mapToPair( + (PairFunction>, String, String>) Tuple2::_2); + } - /** - * @return the list of blocks based on clustering of dedup configuration - * @param: the spark context - * @param: list of entities: - * @param: the dedup configuration - */ - public static JavaPairRDD> createBlocks( - JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { - return mapDocs - // the reduce is just 
to be sure that we haven't document with same id - .reduceByKey((a, b) -> a) - .map(Tuple2::_2) - // Clustering: from to List - .flatMapToPair( - (PairFlatMapFunction) - a -> - DedupUtility.getGroupingKeys(config, a).stream() - .map(it -> new Tuple2<>(it, a)) - .collect(Collectors.toList()) - .iterator()) - .groupByKey(); - } + /** + * @return the list of blocks based on clustering of dedup configuration + * @param: the spark context + * @param: list of entities: + * @param: the dedup configuration + */ + public static JavaPairRDD> createBlocks( + JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { + return mapDocs + // the reduce is just to be sure that we haven't document with same id + .reduceByKey((a, b) -> a) + .map(Tuple2::_2) + // Clustering: from to List + .flatMapToPair( + (PairFlatMapFunction) a -> DedupUtility + .getGroupingKeys(config, a) + .stream() + .map(it -> new Tuple2<>(it, a)) + .collect(Collectors.toList()) + .iterator()) + .groupByKey(); + } - public static JavaPairRDD> createsortedBlocks( - JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { - final String of = config.getWf().getOrderField(); - final int maxQueueSize = config.getWf().getGroupMaxSize(); - return mapDocs - // the reduce is just to be sure that we haven't document with same id - .reduceByKey((a, b) -> a) - .map(Tuple2::_2) - // Clustering: from to List - .flatMapToPair( - (PairFlatMapFunction>) - a -> - DedupUtility.getGroupingKeys(config, a).stream() - .map( - it -> { - List tmp = new ArrayList<>(); - tmp.add(a); - return new Tuple2<>(it, tmp); - }) - .collect(Collectors.toList()) - .iterator()) - .reduceByKey( - (Function2, List, List>) - (v1, v2) -> { - v1.addAll(v2); - v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue())); - if (v1.size() > maxQueueSize) return new ArrayList<>(v1.subList(0, maxQueueSize)); - return v1; - }); - } + public static JavaPairRDD> createsortedBlocks( + JavaSparkContext context, JavaPairRDD 
mapDocs, DedupConfig config) { + final String of = config.getWf().getOrderField(); + final int maxQueueSize = config.getWf().getGroupMaxSize(); + return mapDocs + // the reduce is just to be sure that we haven't document with same id + .reduceByKey((a, b) -> a) + .map(Tuple2::_2) + // Clustering: from to List + .flatMapToPair( + (PairFlatMapFunction>) a -> DedupUtility + .getGroupingKeys(config, a) + .stream() + .map( + it -> { + List tmp = new ArrayList<>(); + tmp.add(a); + return new Tuple2<>(it, tmp); + }) + .collect(Collectors.toList()) + .iterator()) + .reduceByKey( + (Function2, List, List>) (v1, v2) -> { + v1.addAll(v2); + v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue())); + if (v1.size() > maxQueueSize) + return new ArrayList<>(v1.subList(0, maxQueueSize)); + return v1; + }); + } - /** - * @return the list of vertexes: - * @param: the spark context - * @param: list of JSON entities - * @param: the dedup configuration - */ - public static JavaPairRDD mapToVertexes( - JavaSparkContext context, JavaRDD entities, DedupConfig config) { + /** + * @return the list of vertexes: + * @param: the spark context + * @param: list of JSON entities + * @param: the dedup configuration + */ + public static JavaPairRDD mapToVertexes( + JavaSparkContext context, JavaRDD entities, DedupConfig config) { - return entities.mapToPair( - (PairFunction) - s -> { - MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s); - return new Tuple2(mapDocument.getIdentifier(), mapDocument); - }); - } + return entities + .mapToPair( + (PairFunction) s -> { + MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s); + return new Tuple2(mapDocument.getIdentifier(), mapDocument); + }); + } - public static JavaPairRDD computeRelations2( - JavaSparkContext context, JavaPairRDD> blocks, DedupConfig config) { - Map accumulators = - DedupUtility.constructAccumulator(config, context.sc()); + public static JavaPairRDD computeRelations2( + 
JavaSparkContext context, JavaPairRDD> blocks, DedupConfig config) { + Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - return blocks - .flatMapToPair( - (PairFlatMapFunction>, String, String>) - it -> { - try { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter); - return reporter.getRelations().iterator(); - } catch (Exception e) { - throw new RuntimeException(it._2().get(0).getIdentifier(), e); - } - }) - .mapToPair( - (PairFunction, String, Tuple2>) - item -> new Tuple2>(item._1() + item._2(), item)) - .reduceByKey((a, b) -> a) - .mapToPair( - (PairFunction>, String, String>) Tuple2::_2); - } + return blocks + .flatMapToPair( + (PairFlatMapFunction>, String, String>) it -> { + try { + final SparkReporter reporter = new SparkReporter(accumulators); + new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter); + return reporter.getRelations().iterator(); + } catch (Exception e) { + throw new RuntimeException(it._2().get(0).getIdentifier(), e); + } + }) + .mapToPair( + (PairFunction, String, Tuple2>) item -> new Tuple2>( + item._1() + item._2(), item)) + .reduceByKey((a, b) -> a) + .mapToPair( + (PairFunction>, String, String>) Tuple2::_2); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java index 72c771a133..bc99481901 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java @@ -1,11 +1,6 @@ + package eu.dnetlib.dedup; public enum OafEntityType { - datasource, - organization, - project, - dataset, - otherresearchproduct, - software, - publication + datasource, organization, project, dataset, otherresearchproduct, software, publication } diff --git 
a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java index 1039b8636f..f86410d292 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java @@ -1,14 +1,9 @@ + package eu.dnetlib.dedup; -import com.google.common.hash.Hashing; -import eu.dnetlib.dedup.graph.ConnectedComponent; -import eu.dnetlib.dedup.graph.GraphProcessor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.util.ArrayList; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; @@ -20,86 +15,93 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; + +import com.google.common.hash.Hashing; + +import eu.dnetlib.dedup.graph.ConnectedComponent; +import eu.dnetlib.dedup.graph.GraphProcessor; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; public class SparkCreateConnectedComponent { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateConnectedComponent.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - 
.appName(SparkCreateConnectedComponent.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateConnectedComponent.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkCreateConnectedComponent.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); - // final DedupConfig dedupConf = - // DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String entity = parser.get("entity"); + final String targetPath = parser.get("targetPath"); + // final DedupConfig dedupConf = + // DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); + final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - final JavaPairRDD vertexes = - sc.textFile(inputPath + "/" + entity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair( - (PairFunction) - s -> new Tuple2(getHashcode(s), s)); + final JavaPairRDD vertexes = sc + .textFile(inputPath + "/" + entity) + .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) + .mapToPair( + (PairFunction) s -> new Tuple2(getHashcode(s), s)); - final 
Dataset similarityRelations = - spark - .read() - .load(DedupUtility.createSimRelPath(targetPath, entity)) - .as(Encoders.bean(Relation.class)); - final RDD> edgeRdd = - similarityRelations - .javaRDD() - .map( - it -> - new Edge<>( - getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())) - .rdd(); - final JavaRDD cc = - GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()) - .toJavaRDD(); - final Dataset mergeRelation = - spark.createDataset( - cc.filter(k -> k.getDocIds().size() > 1) - .flatMap( - (FlatMapFunction) - c -> - c.getDocIds().stream() - .flatMap( - id -> { - List tmp = new ArrayList<>(); - Relation r = new Relation(); - r.setSource(c.getCcId()); - r.setTarget(id); - r.setRelClass("merges"); - tmp.add(r); - r = new Relation(); - r.setTarget(c.getCcId()); - r.setSource(id); - r.setRelClass("isMergedIn"); - tmp.add(r); - return tmp.stream(); - }) - .iterator()) - .rdd(), - Encoders.bean(Relation.class)); - mergeRelation - .write() - .mode("overwrite") - .save(DedupUtility.createMergeRelPath(targetPath, entity)); - } + final Dataset similarityRelations = spark + .read() + .load(DedupUtility.createSimRelPath(targetPath, entity)) + .as(Encoders.bean(Relation.class)); + final RDD> edgeRdd = similarityRelations + .javaRDD() + .map( + it -> new Edge<>( + getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())) + .rdd(); + final JavaRDD cc = GraphProcessor + .findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()) + .toJavaRDD(); + final Dataset mergeRelation = spark + .createDataset( + cc + .filter(k -> k.getDocIds().size() > 1) + .flatMap( + (FlatMapFunction) c -> c + .getDocIds() + .stream() + .flatMap( + id -> { + List tmp = new ArrayList<>(); + Relation r = new Relation(); + r.setSource(c.getCcId()); + r.setTarget(id); + r.setRelClass("merges"); + tmp.add(r); + r = new Relation(); + r.setTarget(c.getCcId()); + r.setSource(id); + r.setRelClass("isMergedIn"); + 
tmp.add(r); + return tmp.stream(); + }) + .iterator()) + .rdd(), + Encoders.bean(Relation.class)); + mergeRelation + .write() + .mode("overwrite") + .save(DedupUtility.createMergeRelPath(targetPath, entity)); + } - public static long getHashcode(final String id) { - return Hashing.murmur3_128().hashString(id).asLong(); - } + public static long getHashcode(final String id) { + return Hashing.murmur3_128().hashString(id).asLong(); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java index dbc97466d8..d87269f033 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java @@ -1,48 +1,52 @@ + package eu.dnetlib.dedup; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.pace.config.DedupConfig; import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.pace.config.DedupConfig; + public class SparkCreateDedupRecord { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateDedupRecord.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkCreateDedupRecord.class.getSimpleName()) - 
.master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateDedupRecord.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkCreateDedupRecord.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String sourcePath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String dedupPath = parser.get("dedupPath"); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String sourcePath = parser.get("sourcePath"); + final String entity = parser.get("entity"); + final String dedupPath = parser.get("dedupPath"); + final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - final JavaRDD dedupRecord = - DedupRecordFactory.createDedupRecord( - sc, - spark, - DedupUtility.createMergeRelPath(dedupPath, entity), - DedupUtility.createEntityPath(sourcePath, entity), - OafEntityType.valueOf(entity), - dedupConf); - dedupRecord - .map( - r -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(r); - }) - .saveAsTextFile(dedupPath + "/" + entity + "/dedup_records"); - } + final JavaRDD dedupRecord = DedupRecordFactory + .createDedupRecord( + sc, + spark, + DedupUtility.createMergeRelPath(dedupPath, entity), + DedupUtility.createEntityPath(sourcePath, entity), + OafEntityType.valueOf(entity), + dedupConf); + dedupRecord + .map( + r -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(r); + }) + .saveAsTextFile(dedupPath + "/" + entity + "/dedup_records"); + } } diff --git 
a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java index b8df49af3b..41fe911e7e 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java @@ -1,80 +1,83 @@ + package eu.dnetlib.dedup; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; /** * This Spark class creates similarity relations between entities, saving result - * - *

param request: sourcePath entityType target Path + *

+ * param request: sourcePath entityType target Path */ public class SparkCreateSimRels { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkCreateSimRels.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkCreateSimRels.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); - // final DedupConfig dedupConf = - // DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String entity = parser.get("entity"); + final String targetPath = parser.get("targetPath"); + // final DedupConfig dedupConf = + // DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); + final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - JavaPairRDD mapDocument = - sc.textFile(inputPath + "/" + 
entity) - .mapToPair( - s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); + JavaPairRDD mapDocument = sc + .textFile(inputPath + "/" + entity) + .mapToPair( + s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); - // create blocks for deduplication - JavaPairRDD> blocks = - Deduper.createsortedBlocks(sc, mapDocument, dedupConf); - // JavaPairRDD> blocks = Deduper.createBlocks(sc, - // mapDocument, dedupConf); + // create blocks for deduplication + JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); + // JavaPairRDD> blocks = Deduper.createBlocks(sc, + // mapDocument, dedupConf); - // create relations by comparing only elements in the same group - final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); - // final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, - // dedupConf); + // create relations by comparing only elements in the same group + final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); + // final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, + // dedupConf); - final JavaRDD isSimilarToRDD = - dedupRels.map( - simRel -> { - final Relation r = new Relation(); - r.setSource(simRel._1()); - r.setTarget(simRel._2()); - r.setRelClass("isSimilarTo"); - return r; - }); + final JavaRDD isSimilarToRDD = dedupRels + .map( + simRel -> { + final Relation r = new Relation(); + r.setSource(simRel._1()); + r.setTarget(simRel._2()); + r.setRelClass("isSimilarTo"); + return r; + }); - spark - .createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)) - .write() - .mode("overwrite") - .save(DedupUtility.createSimRelPath(targetPath, entity)); - } + spark + .createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)) + .write() + .mode("overwrite") + .save(DedupUtility.createSimRelPath(targetPath, 
entity)); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java index d92eef2d4c..21e72b5b8d 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java @@ -1,49 +1,52 @@ + package eu.dnetlib.dedup; -import eu.dnetlib.pace.util.Reporter; import java.util.ArrayList; import java.util.List; import java.util.Map; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.spark.util.LongAccumulator; + +import eu.dnetlib.pace.util.Reporter; import scala.Serializable; import scala.Tuple2; public class SparkReporter implements Serializable, Reporter { - final List> relations = new ArrayList<>(); - private static final Log log = LogFactory.getLog(SparkReporter.class); - Map accumulators; + final List> relations = new ArrayList<>(); + private static final Log log = LogFactory.getLog(SparkReporter.class); + Map accumulators; - public SparkReporter(Map accumulators) { - this.accumulators = accumulators; - } + public SparkReporter(Map accumulators) { + this.accumulators = accumulators; + } - public void incrementCounter( - String counterGroup, - String counterName, - long delta, - Map accumulators) { + public void incrementCounter( + String counterGroup, + String counterName, + long delta, + Map accumulators) { - final String accumulatorName = String.format("%s::%s", counterGroup, counterName); - if (accumulators.containsKey(accumulatorName)) { - accumulators.get(accumulatorName).add(delta); - } - } + final String accumulatorName = String.format("%s::%s", counterGroup, counterName); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(delta); + } + } - @Override - public void incrementCounter(String counterGroup, String 
counterName, long delta) { + @Override + public void incrementCounter(String counterGroup, String counterName, long delta) { - incrementCounter(counterGroup, counterName, delta, accumulators); - } + incrementCounter(counterGroup, counterName, delta, accumulators); + } - @Override - public void emit(String type, String from, String to) { - relations.add(new Tuple2<>(from, to)); - } + @Override + public void emit(String type, String from, String to) { + relations.add(new Tuple2<>(from, to)); + } - public List> getRelations() { - return relations; - } + public List> getRelations() { + return relations; + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java index 444d987d87..79a3114fda 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java @@ -1,78 +1,84 @@ + package eu.dnetlib.dedup.graph; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dedup.DedupUtility; -import eu.dnetlib.pace.util.PaceException; import java.io.IOException; import java.io.Serializable; import java.util.Set; + import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dedup.DedupUtility; +import eu.dnetlib.pace.util.PaceException; + public class ConnectedComponent implements Serializable { - private Set docIds; - private String ccId; + private Set docIds; + private String ccId; - public ConnectedComponent() {} + public ConnectedComponent() { + } - public ConnectedComponent(Set docIds) { - this.docIds = docIds; - createID(); - } + public ConnectedComponent(Set docIds) { + this.docIds = docIds; + createID(); + } - public String createID() { - if 
(docIds.size() > 1) { - final String s = getMin(); - String prefix = s.split("\\|")[0]; - ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); - return ccId; - } else { - return docIds.iterator().next(); - } - } + public String createID() { + if (docIds.size() > 1) { + final String s = getMin(); + String prefix = s.split("\\|")[0]; + ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); + return ccId; + } else { + return docIds.iterator().next(); + } + } - @JsonIgnore - public String getMin() { + @JsonIgnore + public String getMin() { - final StringBuilder min = new StringBuilder(); - docIds.forEach( - i -> { - if (StringUtils.isBlank(min.toString())) { - min.append(i); - } else { - if (min.toString().compareTo(i) > 0) { - min.setLength(0); - min.append(i); - } - } - }); - return min.toString(); - } + final StringBuilder min = new StringBuilder(); + docIds + .forEach( + i -> { + if (StringUtils.isBlank(min.toString())) { + min.append(i); + } else { + if (min.toString().compareTo(i) > 0) { + min.setLength(0); + min.append(i); + } + } + }); + return min.toString(); + } - @Override - public String toString() { - ObjectMapper mapper = new ObjectMapper(); - try { - return mapper.writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Failed to create Json: ", e); - } - } + @Override + public String toString() { + ObjectMapper mapper = new ObjectMapper(); + try { + return mapper.writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Failed to create Json: ", e); + } + } - public Set getDocIds() { - return docIds; - } + public Set getDocIds() { + return docIds; + } - public void setDocIds(Set docIds) { - this.docIds = docIds; - } + public void setDocIds(Set docIds) { + this.docIds = docIds; + } - public String getCcId() { - return ccId; - } + public String getCcId() { + return ccId; + } - public void setCcId(String ccId) { - this.ccId = ccId; - } + public void setCcId(String ccId) { + this.ccId = ccId; + } } diff 
--git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java index 54a2d5dba1..e3d4fdbe36 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java @@ -1,110 +1,112 @@ + package eu.dnetlib.dedup.sx; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.DHPUtils; import java.io.IOException; + import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; public class SparkPropagateRelationsJob { - enum FieldType { - SOURCE, - TARGET - } + enum FieldType { + SOURCE, TARGET + } - static final String SOURCEJSONPATH = "$.source"; - static final String TARGETJSONPATH = "$.target"; + static final String SOURCEJSONPATH = "$.source"; + static final String TARGETJSONPATH = "$.target"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkPropagateRelationsJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json"))); - 
parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkUpdateEntityJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkPropagateRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkUpdateEntityJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String relationPath = parser.get("relationPath"); - final String mergeRelPath = parser.get("mergeRelPath"); - final String targetRelPath = parser.get("targetRelPath"); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String relationPath = parser.get("relationPath"); + final String mergeRelPath = parser.get("mergeRelPath"); + final String targetRelPath = parser.get("targetRelPath"); - final Dataset merge = - spark - .read() - .load(mergeRelPath) - .as(Encoders.bean(Relation.class)) - .where("relClass == 'merges'"); + final Dataset merge = spark + .read() + .load(mergeRelPath) + .as(Encoders.bean(Relation.class)) + .where("relClass == 'merges'"); - final Dataset rels = - spark.read().load(relationPath).as(Encoders.bean(Relation.class)); + final Dataset rels = spark.read().load(relationPath).as(Encoders.bean(Relation.class)); - final Dataset firstJoin = - rels.joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer") - .map( - (MapFunction, Relation>) - r -> { - final Relation mergeRelation = r._2(); - final Relation relation = r._1(); + final Dataset firstJoin = rels + .joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer") + .map( + (MapFunction, 
Relation>) r -> { + final Relation mergeRelation = r._2(); + final Relation relation = r._1(); - if (mergeRelation != null) relation.setSource(mergeRelation.getSource()); - return relation; - }, - Encoders.bean(Relation.class)); + if (mergeRelation != null) + relation.setSource(mergeRelation.getSource()); + return relation; + }, + Encoders.bean(Relation.class)); - final Dataset secondJoin = - firstJoin - .joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer") - .map( - (MapFunction, Relation>) - r -> { - final Relation mergeRelation = r._2(); - final Relation relation = r._1(); - if (mergeRelation != null) relation.setTarget(mergeRelation.getSource()); - return relation; - }, - Encoders.bean(Relation.class)); + final Dataset secondJoin = firstJoin + .joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer") + .map( + (MapFunction, Relation>) r -> { + final Relation mergeRelation = r._2(); + final Relation relation = r._1(); + if (mergeRelation != null) + relation.setTarget(mergeRelation.getSource()); + return relation; + }, + Encoders.bean(Relation.class)); - secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath); - } + secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath); + } - private static boolean containsDedup(final String json) { - final String source = DHPUtils.getJPathString(SOURCEJSONPATH, json); - final String target = DHPUtils.getJPathString(TARGETJSONPATH, json); + private static boolean containsDedup(final String json) { + final String source = DHPUtils.getJPathString(SOURCEJSONPATH, json); + final String target = DHPUtils.getJPathString(TARGETJSONPATH, json); - return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup"); - } + return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup"); + } - private static String replaceField(final String json, final String id, final FieldType type) { - ObjectMapper mapper = new 
ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - try { - Relation relation = mapper.readValue(json, Relation.class); - if (relation.getDataInfo() == null) relation.setDataInfo(new DataInfo()); - relation.getDataInfo().setDeletedbyinference(false); - switch (type) { - case SOURCE: - relation.setSource(id); - return mapper.writeValueAsString(relation); - case TARGET: - relation.setTarget(id); - return mapper.writeValueAsString(relation); - default: - throw new IllegalArgumentException(""); - } - } catch (IOException e) { - throw new RuntimeException("unable to deserialize json relation: " + json, e); - } - } + private static String replaceField(final String json, final String id, final FieldType type) { + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Relation relation = mapper.readValue(json, Relation.class); + if (relation.getDataInfo() == null) + relation.setDataInfo(new DataInfo()); + relation.getDataInfo().setDeletedbyinference(false); + switch (type) { + case SOURCE: + relation.setSource(id); + return mapper.writeValueAsString(relation); + case TARGET: + relation.setTarget(id); + return mapper.writeValueAsString(relation); + default: + throw new IllegalArgumentException(""); + } + } catch (IOException e) { + throw new RuntimeException("unable to deserialize json relation: " + json, e); + } + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java index 6ebdb05727..a847ad6125 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java @@ -1,7 +1,19 @@ + package eu.dnetlib.dedup.sx; +import java.io.IOException; + +import 
org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.*; + import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Oaf; @@ -10,90 +22,81 @@ import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; import eu.dnetlib.dhp.utils.DHPUtils; -import java.io.IOException; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.*; import scala.Tuple2; public class SparkUpdateEntityJob { - static final String IDJSONPATH = "$.id"; + static final String IDJSONPATH = "$.id"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkUpdateEntityJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkUpdateEntityJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkUpdateEntityJob.class + .getResourceAsStream( + 
"/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkUpdateEntityJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String entityPath = parser.get("entityPath"); - final String mergeRelPath = parser.get("mergeRelPath"); - final String dedupRecordPath = parser.get("dedupRecordPath"); - final String entity = parser.get("entity"); - final String destination = parser.get("targetPath"); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String entityPath = parser.get("entityPath"); + final String mergeRelPath = parser.get("mergeRelPath"); + final String dedupRecordPath = parser.get("dedupRecordPath"); + final String entity = parser.get("entity"); + final String destination = parser.get("targetPath"); - final Dataset df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = - df.where("relClass == 'merges'") - .select(df.col("target")) - .distinct() - .toJavaRDD() - .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); - final JavaRDD sourceEntity = sc.textFile(entityPath); + final Dataset df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); + final JavaPairRDD mergedIds = df + .where("relClass == 'merges'") + .select(df.col("target")) + .distinct() + .toJavaRDD() + .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + final JavaRDD sourceEntity = sc.textFile(entityPath); - final JavaRDD dedupEntity = sc.textFile(dedupRecordPath); - JavaPairRDD entitiesWithId = - sourceEntity.mapToPair( - (PairFunction) - s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s)); - Class mainClass; - switch (entity) { - case "publication": - mainClass = DLIPublication.class; - break; - case "dataset": - mainClass = DLIDataset.class; - 
break; - case "unknown": - mainClass = DLIUnknown.class; - break; - default: - throw new IllegalArgumentException("Illegal type " + entity); - } - JavaRDD map = - entitiesWithId - .leftOuterJoin(mergedIds) - .map( - k -> - k._2()._2().isPresent() - ? updateDeletedByInference(k._2()._1(), mainClass) - : k._2()._1()); - map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class); - } + final JavaRDD dedupEntity = sc.textFile(dedupRecordPath); + JavaPairRDD entitiesWithId = sourceEntity + .mapToPair( + (PairFunction) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s)); + Class mainClass; + switch (entity) { + case "publication": + mainClass = DLIPublication.class; + break; + case "dataset": + mainClass = DLIDataset.class; + break; + case "unknown": + mainClass = DLIUnknown.class; + break; + default: + throw new IllegalArgumentException("Illegal type " + entity); + } + JavaRDD map = entitiesWithId + .leftOuterJoin(mergedIds) + .map( + k -> k._2()._2().isPresent() + ? updateDeletedByInference(k._2()._1(), mainClass) + : k._2()._1()); + map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class); + } - private static String updateDeletedByInference( - final String json, final Class clazz) { - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - try { - Oaf entity = mapper.readValue(json, clazz); - if (entity.getDataInfo() == null) entity.setDataInfo(new DataInfo()); - entity.getDataInfo().setDeletedbyinference(true); - return mapper.writeValueAsString(entity); - } catch (IOException e) { - throw new RuntimeException("Unable to convert json", e); - } - } + private static String updateDeletedByInference( + final String json, final Class clazz) { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Oaf entity = mapper.readValue(json, clazz); + if (entity.getDataInfo() == null) + 
entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return mapper.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java index df46ce76d1..5e63ad8473 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java @@ -1,8 +1,9 @@ + package eu.dnetlib.doiboost.crossref; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.io.ByteArrayOutputStream; import java.util.zip.Inflater; + import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -14,86 +15,89 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + public class CrossrefImporter { - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - CrossrefImporter.class.getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/import_from_es.json"))); - Logger logger = LoggerFactory.getLogger(CrossrefImporter.class); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + CrossrefImporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/import_from_es.json"))); + Logger logger = LoggerFactory.getLogger(CrossrefImporter.class); + parser.parseArgument(args); - final String hdfsuri = parser.get("namenode"); - logger.info("HDFS URI" + hdfsuri); - Path 
hdfswritepath = new Path(parser.get("targetPath")); - logger.info("TargetPath: " + hdfsuri); + final String hdfsuri = parser.get("namenode"); + logger.info("HDFS URI" + hdfsuri); + Path hdfswritepath = new Path(parser.get("targetPath")); + logger.info("TargetPath: " + hdfsuri); - final Long timestamp = - StringUtils.isNotBlank(parser.get("timestamp")) - ? Long.parseLong(parser.get("timestamp")) - : -1; + final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp")) + ? Long.parseLong(parser.get("timestamp")) + : -1; - if (timestamp > 0) logger.info("Timestamp added " + timestamp); + if (timestamp > 0) + logger.info("Timestamp added " + timestamp); - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - ESClient client = - timestamp > 0 - ? new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp) - : new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref"); + ESClient client = timestamp > 0 + ? 
new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp) + : new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref"); - try (SequenceFile.Writer writer = - SequenceFile.createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { - int i = 0; - long start = System.currentTimeMillis(); - long end = 0; - final IntWritable key = new IntWritable(i); - final Text value = new Text(); - while (client.hasNext()) { - key.set(i++); - value.set(client.next()); - writer.append(key, value); - if (i % 1000000 == 0) { - end = System.currentTimeMillis(); - final float time = (end - start) / 1000.0F; - logger.info( - String.format("Imported %d records last 100000 imported in %f seconds", i, time)); - start = System.currentTimeMillis(); - } - } - } - } + int i = 0; + long start = System.currentTimeMillis(); + long end = 0; + final IntWritable key = new IntWritable(i); + final Text value = new Text(); + while (client.hasNext()) { + key.set(i++); + value.set(client.next()); + writer.append(key, value); + if (i % 1000000 == 0) { + end = System.currentTimeMillis(); + final float time = (end - start) / 1000.0F; + logger + .info( + String.format("Imported %d records last 100000 imported in %f seconds", i, time)); + start = System.currentTimeMillis(); + } + } + } + } - public static String decompressBlob(final String blob) { - try { - byte[] byteArray = Base64.decodeBase64(blob.getBytes()); - final Inflater decompresser = new Inflater(); - decompresser.setInput(byteArray); - final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length); - byte[] buffer = new byte[8192]; - while (!decompresser.finished()) { - int size = 
decompresser.inflate(buffer); - bos.write(buffer, 0, size); - } - byte[] unzippeddata = bos.toByteArray(); - decompresser.end(); - return new String(unzippeddata); - } catch (Throwable e) { - throw new RuntimeException("Wrong record:" + blob, e); - } - } + public static String decompressBlob(final String blob) { + try { + byte[] byteArray = Base64.decodeBase64(blob.getBytes()); + final Inflater decompresser = new Inflater(); + decompresser.setInput(byteArray); + final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length); + byte[] buffer = new byte[8192]; + while (!decompresser.finished()) { + int size = decompresser.inflate(buffer); + bos.write(buffer, 0, size); + } + byte[] unzippeddata = bos.toByteArray(); + decompresser.end(); + return new String(unzippeddata); + } catch (Throwable e) { + throw new RuntimeException("Wrong record:" + blob, e); + } + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java index 125dfc034b..e31ccf399b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java @@ -1,9 +1,10 @@ + package eu.dnetlib.doiboost.crossref; -import com.jayway.jsonpath.JsonPath; import java.io.IOException; import java.util.Iterator; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; @@ -13,102 +14,101 @@ import org.apache.http.impl.client.HttpClients; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.jayway.jsonpath.JsonPath; + public class ESClient implements Iterator { - private static final Logger logger = LoggerFactory.getLogger(ESClient.class); + private static final Logger logger = LoggerFactory.getLogger(ESClient.class); - static final 
String blobPath = "$.hits[*].hits[*]._source.blob"; - static final String scrollIdPath = "$._scroll_id"; - static final String JSON_NO_TS = "{\"size\":1000}"; - static final String JSON_WITH_TS = - "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}"; - static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}"; + static final String blobPath = "$.hits[*].hits[*]._source.blob"; + static final String scrollIdPath = "$._scroll_id"; + static final String JSON_NO_TS = "{\"size\":1000}"; + static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}"; + static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}"; - private final String scrollId; + private final String scrollId; - private List buffer; + private List buffer; - private final String esHost; + private final String esHost; - public ESClient(final String esHost, final String esIndex) throws IOException { + public ESClient(final String esHost, final String esIndex) throws IOException { - this.esHost = esHost; - final String body = - getResponse( - String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS); - scrollId = getJPathString(scrollIdPath, body); - buffer = getBlobs(body); - } + this.esHost = esHost; + final String body = getResponse( + String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS); + scrollId = getJPathString(scrollIdPath, body); + buffer = getBlobs(body); + } - public ESClient(final String esHost, final String esIndex, final long timestamp) - throws IOException { - this.esHost = esHost; - final String body = - getResponse( - String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), - String.format(JSON_WITH_TS, timestamp)); - scrollId = getJPathString(scrollIdPath, body); - buffer = getBlobs(body); - } + public ESClient(final String esHost, final String esIndex, final long timestamp) + throws IOException { + this.esHost = esHost; + 
final String body = getResponse( + String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), + String.format(JSON_WITH_TS, timestamp)); + scrollId = getJPathString(scrollIdPath, body); + buffer = getBlobs(body); + } - private String getResponse(final String url, final String json) { - CloseableHttpClient client = HttpClients.createDefault(); - try { + private String getResponse(final String url, final String json) { + CloseableHttpClient client = HttpClients.createDefault(); + try { - HttpPost httpPost = new HttpPost(url); - if (json != null) { - StringEntity entity = new StringEntity(json); - httpPost.setEntity(entity); - httpPost.setHeader("Accept", "application/json"); - httpPost.setHeader("Content-type", "application/json"); - } - CloseableHttpResponse response = client.execute(httpPost); + HttpPost httpPost = new HttpPost(url); + if (json != null) { + StringEntity entity = new StringEntity(json); + httpPost.setEntity(entity); + httpPost.setHeader("Accept", "application/json"); + httpPost.setHeader("Content-type", "application/json"); + } + CloseableHttpResponse response = client.execute(httpPost); - return IOUtils.toString(response.getEntity().getContent()); - } catch (Throwable e) { - throw new RuntimeException("Error on executing request ", e); - } finally { - try { - client.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close client ", e); - } - } - } + return IOUtils.toString(response.getEntity().getContent()); + } catch (Throwable e) { + throw new RuntimeException("Error on executing request ", e); + } finally { + try { + client.close(); + } catch (IOException e) { + throw new RuntimeException("Unable to close client ", e); + } + } + } - private String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) return (String) o; - return null; - } catch (Exception e) { - return ""; - } - } + private String getJPathString(final String 
jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + return null; + } catch (Exception e) { + return ""; + } + } - private List getBlobs(final String body) { - final List res = JsonPath.read(body, "$.hits.hits[*]._source.blob"); - return res; - } + private List getBlobs(final String body) { + final List res = JsonPath.read(body, "$.hits.hits[*]._source.blob"); + return res; + } - @Override - public boolean hasNext() { - return (buffer != null && !buffer.isEmpty()); - } + @Override + public boolean hasNext() { + return (buffer != null && !buffer.isEmpty()); + } - @Override - public String next() { - final String nextItem = buffer.remove(0); - if (buffer.isEmpty()) { + @Override + public String next() { + final String nextItem = buffer.remove(0); + if (buffer.isEmpty()) { - final String json_param = String.format(JSON_SCROLL, scrollId); - final String body = - getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); - try { - buffer = getBlobs(body); - } catch (Throwable e) { - logger.error("Error on get next page: body:" + body); - } - } - return nextItem; - } + final String json_param = String.format(JSON_SCROLL, scrollId); + final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); + try { + buffer = getBlobs(body); + } catch (Throwable e) { + logger.error("Error on get next page: body:" + body); + } + } + return nextItem; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java index 0f086434b4..570fdef17c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java @@ -1,13 +1,12 @@ + package eu.dnetlib.doiboost.orcid; 
-import eu.dnetlib.doiboost.orcid.json.JsonWriter; -import eu.dnetlib.doiboost.orcid.model.WorkData; -import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URI; + import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.hadoop.conf.Configuration; @@ -20,122 +19,128 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; +import eu.dnetlib.doiboost.orcid.json.JsonWriter; +import eu.dnetlib.doiboost.orcid.model.WorkData; +import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; + public class ActivitiesDecompressor { - private static final int MAX_XML_WORKS_PARSED = -1; - private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000; + private static final int MAX_XML_WORKS_PARSED = -1; + private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000; - public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath) - throws Exception { - String uri = inputUri; - FileSystem fs = FileSystem.get(URI.create(uri), conf); - Path inputPath = new Path(uri); - CompressionCodecFactory factory = new CompressionCodecFactory(conf); - CompressionCodec codec = factory.getCodec(inputPath); - if (codec == null) { - System.err.println("No codec found for " + uri); - System.exit(1); - } - CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); - InputStream gzipInputStream = null; - try { - gzipInputStream = codec.createInputStream(fs.open(inputPath)); - parseTarActivities(fs, conf, gzipInputStream, outputPath); + public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath) + throws Exception { + String uri = inputUri; + FileSystem fs = FileSystem.get(URI.create(uri), conf); 
+ Path inputPath = new Path(uri); + CompressionCodecFactory factory = new CompressionCodecFactory(conf); + CompressionCodec codec = factory.getCodec(inputPath); + if (codec == null) { + System.err.println("No codec found for " + uri); + System.exit(1); + } + CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); + InputStream gzipInputStream = null; + try { + gzipInputStream = codec.createInputStream(fs.open(inputPath)); + parseTarActivities(fs, conf, gzipInputStream, outputPath); - } finally { - Log.debug("Closing gzip stream"); - IOUtils.closeStream(gzipInputStream); - } - } + } finally { + Log.debug("Closing gzip stream"); + IOUtils.closeStream(gzipInputStream); + } + } - private static void parseTarActivities( - FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) { - int counter = 0; - int doiFound = 0; - int errorFromOrcidFound = 0; - int xmlParserErrorFound = 0; - try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { - TarArchiveEntry entry = null; + private static void parseTarActivities( + FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) { + int counter = 0; + int doiFound = 0; + int errorFromOrcidFound = 0; + int xmlParserErrorFound = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { + TarArchiveEntry entry = null; - try (SequenceFile.Writer writer = - SequenceFile.createWriter( - conf, - SequenceFile.Writer.file(outputPath), - SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class))) { - while ((entry = tais.getNextTarEntry()) != null) { - String filename = entry.getName(); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(outputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class))) { + while ((entry = tais.getNextTarEntry()) != null) { + String filename = entry.getName(); - try { - if 
(entry.isDirectory() || !filename.contains("works")) { + try { + if (entry.isDirectory() || !filename.contains("works")) { - } else { - Log.debug("XML work entry name: " + entry.getName()); - counter++; - BufferedReader br = - new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput - String line; - StringBuffer buffer = new StringBuffer(); - while ((line = br.readLine()) != null) { - buffer.append(line); - } - WorkData workData = XMLRecordParser.VTDParseWorkData(buffer.toString().getBytes()); - if (workData != null) { - if (workData.getErrorCode() != null) { - errorFromOrcidFound += 1; - Log.debug( - "error from Orcid with code " - + workData.getErrorCode() - + " for entry " - + entry.getName()); - continue; - } - if (workData.isDoiFound()) { - String jsonData = JsonWriter.create(workData); - Log.debug("oid: " + workData.getOid() + " data: " + jsonData); + } else { + Log.debug("XML work entry name: " + entry.getName()); + counter++; + BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from + // tarInput + String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + WorkData workData = XMLRecordParser.VTDParseWorkData(buffer.toString().getBytes()); + if (workData != null) { + if (workData.getErrorCode() != null) { + errorFromOrcidFound += 1; + Log + .debug( + "error from Orcid with code " + + workData.getErrorCode() + + " for entry " + + entry.getName()); + continue; + } + if (workData.isDoiFound()) { + String jsonData = JsonWriter.create(workData); + Log.debug("oid: " + workData.getOid() + " data: " + jsonData); - final Text key = new Text(workData.getOid()); - final Text value = new Text(jsonData); + final Text key = new Text(workData.getOid()); + final Text value = new Text(jsonData); - try { - writer.append(key, value); - } catch (IOException e) { - Log.debug("Writing to sequence file: " + e.getMessage()); - Log.debug(e); - throw new 
RuntimeException(e); - } - doiFound += 1; - } + try { + writer.append(key, value); + } catch (IOException e) { + Log.debug("Writing to sequence file: " + e.getMessage()); + Log.debug(e); + throw new RuntimeException(e); + } + doiFound += 1; + } - } else { - Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString()); - xmlParserErrorFound += 1; - } - } - } catch (Exception e) { - Log.warn( - "Parsing work from tar archive and xml work: " + filename + " " + e.getMessage()); - Log.warn(e); - } + } else { + Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString()); + xmlParserErrorFound += 1; + } + } + } catch (Exception e) { + Log + .warn( + "Parsing work from tar archive and xml work: " + filename + " " + e.getMessage()); + Log.warn(e); + } - if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) { - Log.info("Current xml works parsed: " + counter); - } + if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) { + Log.info("Current xml works parsed: " + counter); + } - if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) { - break; - } - } - } - } catch (IOException e) { - Log.warn("Parsing work from gzip archive: " + e.getMessage()); - Log.warn(e); - throw new RuntimeException(e); - } - Log.info("Activities parse completed"); - Log.info("Total XML works parsed: " + counter); - Log.info("Total doi found: " + doiFound); - Log.info("Error from Orcid found: " + errorFromOrcidFound); - Log.info("Error parsing xml work found: " + xmlParserErrorFound); - } + if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) { + break; + } + } + } + } catch (IOException e) { + Log.warn("Parsing work from gzip archive: " + e.getMessage()); + Log.warn(e); + throw new RuntimeException(e); + } + Log.info("Activities parse completed"); + Log.info("Total XML works parsed: " + counter); + Log.info("Total doi found: " + doiFound); + Log.info("Error from Orcid found: " + errorFromOrcidFound); + Log.info("Error 
parsing xml work found: " + xmlParserErrorFound); + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java index 7c42285de6..70528a8f60 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java @@ -1,48 +1,51 @@ + package eu.dnetlib.doiboost.orcid; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.io.IOException; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.mortbay.log.Log; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + public class OrcidAuthorsDOIsDataGen extends OrcidDSManager { - private String activitiesFileNameTarGz; - private String outputAuthorsDOIsPath; + private String activitiesFileNameTarGz; + private String outputAuthorsDOIsPath; - public static void main(String[] args) throws IOException, Exception { - OrcidAuthorsDOIsDataGen orcidAuthorsDOIsDataGen = new OrcidAuthorsDOIsDataGen(); - orcidAuthorsDOIsDataGen.loadArgs(args); - orcidAuthorsDOIsDataGen.generateAuthorsDOIsData(); - } + public static void main(String[] args) throws IOException, Exception { + OrcidAuthorsDOIsDataGen orcidAuthorsDOIsDataGen = new OrcidAuthorsDOIsDataGen(); + orcidAuthorsDOIsDataGen.loadArgs(args); + orcidAuthorsDOIsDataGen.generateAuthorsDOIsData(); + } - public void generateAuthorsDOIsData() throws Exception { - Configuration conf = initConfigurationObject(); - FileSystem fs = initFileSystemObject(conf); - String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz); - Path outputPath = - new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath)); - 
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath); - } + public void generateAuthorsDOIsData() throws Exception { + Configuration conf = initConfigurationObject(); + FileSystem fs = initFileSystemObject(conf); + String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz); + Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath)); + ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath); + } - private void loadArgs(String[] args) throws IOException, Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - OrcidAuthorsDOIsDataGen.class.getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json"))); - parser.parseArgument(args); + private void loadArgs(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + OrcidAuthorsDOIsDataGen.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json"))); + parser.parseArgument(args); - hdfsServerUri = parser.get("hdfsServerUri"); - Log.info("HDFS URI: " + hdfsServerUri); - hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); - Log.info("Default Path: " + hdfsOrcidDefaultPath); - activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); - Log.info("Activities File Name: " + activitiesFileNameTarGz); - outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath"); - Log.info("Output Authors DOIs Data: " + outputAuthorsDOIsPath); - } + hdfsServerUri = parser.get("hdfsServerUri"); + Log.info("HDFS URI: " + hdfsServerUri); + hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); + Log.info("Default Path: " + hdfsOrcidDefaultPath); + activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); + Log.info("Activities File Name: " + activitiesFileNameTarGz); + outputAuthorsDOIsPath = 
parser.get("outputAuthorsDOIsPath"); + Log.info("Output Authors DOIs Data: " + outputAuthorsDOIsPath); + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index 876356cc74..4f846bdf3c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -1,78 +1,81 @@ + package eu.dnetlib.doiboost.orcid; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.io.IOException; import java.net.URI; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.mortbay.log.Log; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + public class OrcidDSManager { - protected String hdfsServerUri; - protected String hdfsOrcidDefaultPath; - private String summariesFileNameTarGz; - private String outputAuthorsPath; + protected String hdfsServerUri; + protected String hdfsOrcidDefaultPath; + private String summariesFileNameTarGz; + private String outputAuthorsPath; - public static void main(String[] args) throws IOException, Exception { - OrcidDSManager orcidDSManager = new OrcidDSManager(); - orcidDSManager.loadArgs(args); - orcidDSManager.generateAuthors(); - } + public static void main(String[] args) throws IOException, Exception { + OrcidDSManager orcidDSManager = new OrcidDSManager(); + orcidDSManager.loadArgs(args); + orcidDSManager.generateAuthors(); + } - public void generateAuthors() throws Exception { - Configuration conf = initConfigurationObject(); - FileSystem fs = initFileSystemObject(conf); - String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz); - Path outputPath = - new Path( - hdfsServerUri - .concat(hdfsOrcidDefaultPath) - 
.concat(outputAuthorsPath) - .concat("authors.seq")); - SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath); - } + public void generateAuthors() throws Exception { + Configuration conf = initConfigurationObject(); + FileSystem fs = initFileSystemObject(conf); + String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz); + Path outputPath = new Path( + hdfsServerUri + .concat(hdfsOrcidDefaultPath) + .concat(outputAuthorsPath) + .concat("authors.seq")); + SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath); + } - protected Configuration initConfigurationObject() { - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath)); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - return conf; - } + protected Configuration initConfigurationObject() { + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath)); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + return conf; + } - protected FileSystem initFileSystemObject(Configuration conf) { - // Get the filesystem - HDFS - FileSystem fs = null; - try { - fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return fs; - } + protected FileSystem initFileSystemObject(Configuration conf) { + // Get the filesystem - HDFS + FileSystem fs = null; + try { + fs = 
FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return fs; + } - private void loadArgs(String[] args) throws IOException, Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - OrcidDSManager.class.getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json"))); - parser.parseArgument(args); + private void loadArgs(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + OrcidDSManager.class + .getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json"))); + parser.parseArgument(args); - hdfsServerUri = parser.get("hdfsServerUri"); - Log.info("HDFS URI: " + hdfsServerUri); - hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); - Log.info("Default Path: " + hdfsOrcidDefaultPath); - summariesFileNameTarGz = parser.get("summariesFileNameTarGz"); - Log.info("Summaries File Name: " + summariesFileNameTarGz); - outputAuthorsPath = parser.get("outputAuthorsPath"); - Log.info("Output Authors Data: " + outputAuthorsPath); - } + hdfsServerUri = parser.get("hdfsServerUri"); + Log.info("HDFS URI: " + hdfsServerUri); + hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); + Log.info("Default Path: " + hdfsOrcidDefaultPath); + summariesFileNameTarGz = parser.get("summariesFileNameTarGz"); + Log.info("Summaries File Name: " + summariesFileNameTarGz); + outputAuthorsPath = parser.get("outputAuthorsPath"); + Log.info("Output Authors Data: " + outputAuthorsPath); + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java index d85aacd31c..f0bbb5c327 100644 --- 
a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java @@ -1,13 +1,12 @@ + package eu.dnetlib.doiboost.orcid; -import eu.dnetlib.doiboost.orcid.json.JsonWriter; -import eu.dnetlib.doiboost.orcid.model.AuthorData; -import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URI; + import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.hadoop.conf.Configuration; @@ -20,135 +19,140 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.mortbay.log.Log; +import eu.dnetlib.doiboost.orcid.json.JsonWriter; +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser; + public class SummariesDecompressor { - private static final int MAX_XML_RECORDS_PARSED = -1; + private static final int MAX_XML_RECORDS_PARSED = -1; - public static void parseGzSummaries(Configuration conf, String inputUri, Path outputPath) - throws Exception { - String uri = inputUri; - FileSystem fs = FileSystem.get(URI.create(uri), conf); - Path inputPath = new Path(uri); - CompressionCodecFactory factory = new CompressionCodecFactory(conf); - CompressionCodec codec = factory.getCodec(inputPath); - if (codec == null) { - System.err.println("No codec found for " + uri); - System.exit(1); - } - CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); - InputStream gzipInputStream = null; - try { - gzipInputStream = codec.createInputStream(fs.open(inputPath)); - parseTarSummaries(fs, conf, gzipInputStream, outputPath); + public static void parseGzSummaries(Configuration conf, String inputUri, Path 
outputPath) + throws Exception { + String uri = inputUri; + FileSystem fs = FileSystem.get(URI.create(uri), conf); + Path inputPath = new Path(uri); + CompressionCodecFactory factory = new CompressionCodecFactory(conf); + CompressionCodec codec = factory.getCodec(inputPath); + if (codec == null) { + System.err.println("No codec found for " + uri); + System.exit(1); + } + CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); + InputStream gzipInputStream = null; + try { + gzipInputStream = codec.createInputStream(fs.open(inputPath)); + parseTarSummaries(fs, conf, gzipInputStream, outputPath); - } finally { - Log.debug("Closing gzip stream"); - IOUtils.closeStream(gzipInputStream); - } - } + } finally { + Log.debug("Closing gzip stream"); + IOUtils.closeStream(gzipInputStream); + } + } - private static void parseTarSummaries( - FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) { - int counter = 0; - int nameFound = 0; - int surnameFound = 0; - int creditNameFound = 0; - int errorFromOrcidFound = 0; - int xmlParserErrorFound = 0; - try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { - TarArchiveEntry entry = null; + private static void parseTarSummaries( + FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) { + int counter = 0; + int nameFound = 0; + int surnameFound = 0; + int creditNameFound = 0; + int errorFromOrcidFound = 0; + int xmlParserErrorFound = 0; + try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) { + TarArchiveEntry entry = null; - try (SequenceFile.Writer writer = - SequenceFile.createWriter( - conf, - SequenceFile.Writer.file(outputPath), - SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class))) { - while ((entry = tais.getNextTarEntry()) != null) { - String filename = entry.getName(); - try { - if (entry.isDirectory()) { - Log.debug("Directory entry name: " + entry.getName()); - } else { 
- Log.debug("XML record entry name: " + entry.getName()); - counter++; - BufferedReader br = - new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput - String line; - StringBuffer buffer = new StringBuffer(); - while ((line = br.readLine()) != null) { - buffer.append(line); - } - AuthorData authorData = - XMLRecordParser.VTDParseAuthorData(buffer.toString().getBytes()); - if (authorData != null) { - if (authorData.getErrorCode() != null) { - errorFromOrcidFound += 1; - Log.debug( - "error from Orcid with code " - + authorData.getErrorCode() - + " for oid " - + entry.getName()); - continue; - } - String jsonData = JsonWriter.create(authorData); - Log.debug("oid: " + authorData.getOid() + " data: " + jsonData); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(outputPath), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class))) { + while ((entry = tais.getNextTarEntry()) != null) { + String filename = entry.getName(); + try { + if (entry.isDirectory()) { + Log.debug("Directory entry name: " + entry.getName()); + } else { + Log.debug("XML record entry name: " + entry.getName()); + counter++; + BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from + // tarInput + String line; + StringBuffer buffer = new StringBuffer(); + while ((line = br.readLine()) != null) { + buffer.append(line); + } + AuthorData authorData = XMLRecordParser.VTDParseAuthorData(buffer.toString().getBytes()); + if (authorData != null) { + if (authorData.getErrorCode() != null) { + errorFromOrcidFound += 1; + Log + .debug( + "error from Orcid with code " + + authorData.getErrorCode() + + " for oid " + + entry.getName()); + continue; + } + String jsonData = JsonWriter.create(authorData); + Log.debug("oid: " + authorData.getOid() + " data: " + jsonData); - final Text key = new Text(authorData.getOid()); - final Text value = new Text(jsonData); + final Text 
key = new Text(authorData.getOid()); + final Text value = new Text(jsonData); - try { - writer.append(key, value); - } catch (IOException e) { - Log.debug("Writing to sequence file: " + e.getMessage()); - Log.debug(e); - throw new RuntimeException(e); - } + try { + writer.append(key, value); + } catch (IOException e) { + Log.debug("Writing to sequence file: " + e.getMessage()); + Log.debug(e); + throw new RuntimeException(e); + } - if (authorData.getName() != null) { - nameFound += 1; - } - if (authorData.getSurname() != null) { - surnameFound += 1; - } - if (authorData.getCreditName() != null) { - creditNameFound += 1; - } + if (authorData.getName() != null) { + nameFound += 1; + } + if (authorData.getSurname() != null) { + surnameFound += 1; + } + if (authorData.getCreditName() != null) { + creditNameFound += 1; + } - } else { - Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString()); - xmlParserErrorFound += 1; - } - } - } catch (Exception e) { - Log.warn( - "Parsing record from tar archive and xml record: " - + filename - + " " - + e.getMessage()); - Log.warn(e); - } + } else { + Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString()); + xmlParserErrorFound += 1; + } + } + } catch (Exception e) { + Log + .warn( + "Parsing record from tar archive and xml record: " + + filename + + " " + + e.getMessage()); + Log.warn(e); + } - if ((counter % 100000) == 0) { - Log.info("Current xml records parsed: " + counter); - } + if ((counter % 100000) == 0) { + Log.info("Current xml records parsed: " + counter); + } - if ((MAX_XML_RECORDS_PARSED > -1) && (counter > MAX_XML_RECORDS_PARSED)) { - break; - } - } - } - } catch (IOException e) { - Log.warn("Parsing record from gzip archive: " + e.getMessage()); - Log.warn(e); - throw new RuntimeException(e); - } - Log.info("Summaries parse completed"); - Log.info("Total XML records parsed: " + counter); - Log.info("Name found: " + nameFound); - Log.info("Surname found: " + 
surnameFound); - Log.info("Credit name found: " + creditNameFound); - Log.info("Error from Orcid found: " + errorFromOrcidFound); - Log.info("Error parsing xml record found: " + xmlParserErrorFound); - } + if ((MAX_XML_RECORDS_PARSED > -1) && (counter > MAX_XML_RECORDS_PARSED)) { + break; + } + } + } + } catch (IOException e) { + Log.warn("Parsing record from gzip archive: " + e.getMessage()); + Log.warn(e); + throw new RuntimeException(e); + } + Log.info("Summaries parse completed"); + Log.info("Total XML records parsed: " + counter); + Log.info("Name found: " + nameFound); + Log.info("Surname found: " + surnameFound); + Log.info("Credit name found: " + creditNameFound); + Log.info("Error from Orcid found: " + errorFromOrcidFound); + Log.info("Error parsing xml record found: " + xmlParserErrorFound); + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java index e0ab16570a..35676d5bab 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java @@ -1,26 +1,28 @@ + package eu.dnetlib.doiboost.orcid.json; import com.google.gson.JsonObject; + import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcid.model.WorkData; public class JsonWriter { - public static String create(AuthorData authorData) { - JsonObject author = new JsonObject(); - author.addProperty("oid", authorData.getOid()); - author.addProperty("name", authorData.getName()); - author.addProperty("surname", authorData.getSurname()); - if (authorData.getCreditName() != null) { - author.addProperty("creditname", authorData.getCreditName()); - } - return author.toString(); - } + public static String create(AuthorData authorData) { + JsonObject author = new JsonObject(); + author.addProperty("oid", 
authorData.getOid()); + author.addProperty("name", authorData.getName()); + author.addProperty("surname", authorData.getSurname()); + if (authorData.getCreditName() != null) { + author.addProperty("creditname", authorData.getCreditName()); + } + return author.toString(); + } - public static String create(WorkData workData) { - JsonObject work = new JsonObject(); - work.addProperty("oid", workData.getOid()); - work.addProperty("doi", workData.getDoi()); - return work.toString(); - } + public static String create(WorkData workData) { + JsonObject work = new JsonObject(); + work.addProperty("oid", workData.getOid()); + work.addProperty("doi", workData.getDoi()); + return work.toString(); + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java index d413043349..1e1ef5c1de 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java @@ -1,50 +1,51 @@ + package eu.dnetlib.doiboost.orcid.model; public class AuthorData { - private String oid; - private String name; - private String surname; - private String creditName; - private String errorCode; + private String oid; + private String name; + private String surname; + private String creditName; + private String errorCode; - public String getErrorCode() { - return errorCode; - } + public String getErrorCode() { + return errorCode; + } - public void setErrorCode(String errorCode) { - this.errorCode = errorCode; - } + public void setErrorCode(String errorCode) { + this.errorCode = errorCode; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getSurname() { - return 
surname; - } + public String getSurname() { + return surname; + } - public void setSurname(String surname) { - this.surname = surname; - } + public void setSurname(String surname) { + this.surname = surname; + } - public String getCreditName() { - return creditName; - } + public String getCreditName() { + return creditName; + } - public void setCreditName(String creditName) { - this.creditName = creditName; - } + public void setCreditName(String creditName) { + this.creditName = creditName; + } - public String getOid() { - return oid; - } + public String getOid() { + return oid; + } - public void setOid(String oid) { - this.oid = oid; - } + public void setOid(String oid) { + this.oid = oid; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/WorkData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/WorkData.java index 9ce843e9f7..edd565686c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/WorkData.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/WorkData.java @@ -1,42 +1,43 @@ + package eu.dnetlib.doiboost.orcid.model; public class WorkData { - private String oid; - private String doi; - private boolean doiFound = false; + private String oid; + private String doi; + private boolean doiFound = false; - public boolean isDoiFound() { - return doiFound; - } + public boolean isDoiFound() { + return doiFound; + } - public void setDoiFound(boolean doiFound) { - this.doiFound = doiFound; - } + public void setDoiFound(boolean doiFound) { + this.doiFound = doiFound; + } - public String getOid() { - return oid; - } + public String getOid() { + return oid; + } - public void setOid(String oid) { - this.oid = oid; - } + public void setOid(String oid) { + this.oid = oid; + } - public String getDoi() { - return doi; - } + public String getDoi() { + return doi; + } - public void setDoi(String doi) { - this.doi = doi; - } + public void 
setDoi(String doi) { + this.doi = doi; + } - public String getErrorCode() { - return errorCode; - } + public String getErrorCode() { + return errorCode; + } - public void setErrorCode(String errorCode) { - this.errorCode = errorCode; - } + public void setErrorCode(String errorCode) { + this.errorCode = errorCode; + } - private String errorCode; + private String errorCode; } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java index e5eecff9bb..2e43f4d3e6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParser.java @@ -1,5 +1,9 @@ + package eu.dnetlib.doiboost.orcid.xml; +import java.util.Arrays; +import java.util.List; + import com.ximpleware.AutoPilot; import com.ximpleware.EOFException; import com.ximpleware.EncodingException; @@ -7,117 +11,113 @@ import com.ximpleware.EntityException; import com.ximpleware.ParseException; import com.ximpleware.VTDGen; import com.ximpleware.VTDNav; + import eu.dnetlib.dhp.parser.utility.VtdException; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcid.model.WorkData; -import java.util.Arrays; -import java.util.List; public class XMLRecordParser { - private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common"; - private static final String NS_COMMON = "common"; - private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person"; - private static final String NS_PERSON = "person"; - private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details"; - private static final String NS_DETAILS = "personal-details"; - private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name"; - private static final 
String NS_OTHER = "other-name"; - private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record"; - private static final String NS_RECORD = "record"; - private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error"; + private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common"; + private static final String NS_COMMON = "common"; + private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person"; + private static final String NS_PERSON = "person"; + private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details"; + private static final String NS_DETAILS = "personal-details"; + private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name"; + private static final String NS_OTHER = "other-name"; + private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record"; + private static final String NS_RECORD = "record"; + private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error"; - private static final String NS_WORK = "work"; - private static final String NS_WORK_URL = "http://www.orcid.org/ns/work"; + private static final String NS_WORK = "work"; + private static final String NS_WORK_URL = "http://www.orcid.org/ns/work"; - private static final String NS_ERROR = "error"; + private static final String NS_ERROR = "error"; - public static AuthorData VTDParseAuthorData(byte[] bytes) - throws VtdException, EncodingException, EOFException, EntityException, ParseException { - final VTDGen vg = new VTDGen(); - vg.setDoc(bytes); - vg.parse(true); - final VTDNav vn = vg.getNav(); - final AutoPilot ap = new AutoPilot(vn); - ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL); - ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL); - ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL); - ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL); - ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL); - ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); + public static 
AuthorData VTDParseAuthorData(byte[] bytes) + throws VtdException, EncodingException, EOFException, EntityException, ParseException { + final VTDGen vg = new VTDGen(); + vg.setDoc(bytes); + vg.parse(true); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); + ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL); + ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL); + ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL); + ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL); + ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL); + ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); - AuthorData authorData = new AuthorData(); - final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); - if (!errors.isEmpty()) { - authorData.setErrorCode(errors.get(0)); - return authorData; - } + AuthorData authorData = new AuthorData(); + final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); + if (!errors.isEmpty()) { + authorData.setErrorCode(errors.get(0)); + return authorData; + } - List recordNodes = - VtdUtilityParser.getTextValuesWithAttributes( - ap, vn, "//record:record", Arrays.asList("path")); - if (!recordNodes.isEmpty()) { - final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1); - authorData.setOid(oid); - } else { - return null; - } + List recordNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//record:record", Arrays.asList("path")); + if (!recordNodes.isEmpty()) { + final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1); + authorData.setOid(oid); + } else { + return null; + } - final List names = - VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names"); - if (!names.isEmpty()) { - authorData.setName(names.get(0)); - } + final List names = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names"); + if (!names.isEmpty()) { + authorData.setName(names.get(0)); + } - final List surnames = - 
VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name"); - if (!surnames.isEmpty()) { - authorData.setSurname(surnames.get(0)); - } + final List surnames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name"); + if (!surnames.isEmpty()) { + authorData.setSurname(surnames.get(0)); + } - final List creditNames = - VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name"); - if (!creditNames.isEmpty()) { - authorData.setCreditName(creditNames.get(0)); - } - return authorData; - } + final List creditNames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name"); + if (!creditNames.isEmpty()) { + authorData.setCreditName(creditNames.get(0)); + } + return authorData; + } - public static WorkData VTDParseWorkData(byte[] bytes) - throws VtdException, EncodingException, EOFException, EntityException, ParseException { - final VTDGen vg = new VTDGen(); - vg.setDoc(bytes); - vg.parse(true); - final VTDNav vn = vg.getNav(); - final AutoPilot ap = new AutoPilot(vn); - ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL); - ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL); - ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); + public static WorkData VTDParseWorkData(byte[] bytes) + throws VtdException, EncodingException, EOFException, EntityException, ParseException { + final VTDGen vg = new VTDGen(); + vg.setDoc(bytes); + vg.parse(true); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); + ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL); + ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL); + ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL); - WorkData workData = new WorkData(); - final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); - if (!errors.isEmpty()) { - workData.setErrorCode(errors.get(0)); - return workData; - } + WorkData workData = new WorkData(); + final List errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code"); + if 
(!errors.isEmpty()) { + workData.setErrorCode(errors.get(0)); + return workData; + } - List workNodes = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path")); - if (!workNodes.isEmpty()) { - final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1]; - workData.setOid(oid); - } else { - return null; - } + List workNodes = VtdUtilityParser + .getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path")); + if (!workNodes.isEmpty()) { + final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1]; + workData.setOid(oid); + } else { + return null; + } - final List dois = - VtdUtilityParser.getTextValue( - ap, vn, "//common:external-id-type[text()=\"doi\"]/../common:external-id-value"); - if (!dois.isEmpty()) { - workData.setDoi(dois.get(0)); - workData.setDoiFound(true); - } - return workData; - } + final List dois = VtdUtilityParser + .getTextValue( + ap, vn, "//common:external-id-type[text()=\"doi\"]/../common:external-id-value"); + if (!dois.isEmpty()) { + workData.setDoi(dois.get(0)); + workData.setDoiFound(true); + } + return workData; + } } diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java index a30c636646..19abeb2662 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java @@ -1,24 +1,28 @@ + package eu.dnetlib.doiboost; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationFeature; -import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; + +import 
eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF; + public class DoiBoostTest { - final ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT); - Logger logger = LoggerFactory.getLogger(DoiBoostTest.class); + final ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT); + Logger logger = LoggerFactory.getLogger(DoiBoostTest.class); - @Test - public void test() throws Exception { + @Test + public void test() throws Exception { - // SparkDownloadContentFromCrossref.main(null); - // CrossrefImporter.main("-n file:///tmp -t file:///tmp/p.seq -ts 1586110000749".split(" - // ")); - SparkMapDumpIntoOAF.main( - "-m local[*] -s file:///data/doiboost/crossref_dump.seq -t /data/doiboost".split(" ")); - } + // SparkDownloadContentFromCrossref.main(null); + // CrossrefImporter.main("-n file:///tmp -t file:///tmp/p.seq -ts 1586110000749".split(" + // ")); + SparkMapDumpIntoOAF + .main( + "-m local[*] -s file:///data/doiboost/crossref_dump.seq -t /data/doiboost".split(" ")); + } } diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java index de41eacfa0..d5da4eec02 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/xml/XMLRecordParserTest.java @@ -1,57 +1,58 @@ + package eu.dnetlib.doiboost.orcid.xml; import static org.junit.jupiter.api.Assertions.assertNotNull; -import eu.dnetlib.doiboost.orcid.model.AuthorData; -import eu.dnetlib.doiboost.orcid.model.WorkData; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; +import eu.dnetlib.doiboost.orcid.model.AuthorData; +import eu.dnetlib.doiboost.orcid.model.WorkData; + public class XMLRecordParserTest { - @Test - public void testOrcidAuthorDataXMLParser() throws Exception { + 
@Test + public void testOrcidAuthorDataXMLParser() throws Exception { - String xml = - IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml")); + String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml")); - XMLRecordParser p = new XMLRecordParser(); + XMLRecordParser p = new XMLRecordParser(); - AuthorData authorData = p.VTDParseAuthorData(xml.getBytes()); - assertNotNull(authorData); - assertNotNull(authorData.getName()); - System.out.println("name: " + authorData.getName()); - assertNotNull(authorData.getSurname()); - System.out.println("surname: " + authorData.getSurname()); - } + AuthorData authorData = p.VTDParseAuthorData(xml.getBytes()); + assertNotNull(authorData); + assertNotNull(authorData.getName()); + System.out.println("name: " + authorData.getName()); + assertNotNull(authorData.getSurname()); + System.out.println("surname: " + authorData.getSurname()); + } - @Test - public void testOrcidXMLErrorRecordParser() throws Exception { + @Test + public void testOrcidXMLErrorRecordParser() throws Exception { - String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml")); + String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml")); - XMLRecordParser p = new XMLRecordParser(); + XMLRecordParser p = new XMLRecordParser(); - AuthorData authorData = p.VTDParseAuthorData(xml.getBytes()); - assertNotNull(authorData); - assertNotNull(authorData.getErrorCode()); - System.out.println("error: " + authorData.getErrorCode()); - } + AuthorData authorData = p.VTDParseAuthorData(xml.getBytes()); + assertNotNull(authorData); + assertNotNull(authorData.getErrorCode()); + System.out.println("error: " + authorData.getErrorCode()); + } - @Test - public void testOrcidWorkDataXMLParser() throws Exception { + @Test + public void testOrcidWorkDataXMLParser() throws Exception { - String xml = - IOUtils.toString( - 
this.getClass().getResourceAsStream("activity_work_0000-0002-5982-8983.xml")); + String xml = IOUtils + .toString( + this.getClass().getResourceAsStream("activity_work_0000-0002-5982-8983.xml")); - XMLRecordParser p = new XMLRecordParser(); + XMLRecordParser p = new XMLRecordParser(); - WorkData workData = p.VTDParseWorkData(xml.getBytes()); - assertNotNull(workData); - assertNotNull(workData.getOid()); - System.out.println("oid: " + workData.getOid()); - assertNotNull(workData.getDoi()); - System.out.println("doi: " + workData.getDoi()); - } + WorkData workData = p.VTDParseWorkData(xml.getBytes()); + assertNotNull(workData); + assertNotNull(workData.getOid()); + System.out.println("oid: " + workData.getOid()); + assertNotNull(workData.getDoi()); + System.out.println("doi: " + workData.getDoi()); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveImporterJob.java index 1046df6092..0f74c63430 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveImporterJob.java @@ -1,11 +1,10 @@ + package eu.dnetlib.dhp.oa.graph.hive; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.common.ModelSupport; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -15,61 +14,68 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import 
eu.dnetlib.dhp.schema.common.ModelSupport; + public class GraphHiveImporterJob { - private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJob.class); + private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - GraphHiveImporterJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/input_graph_hive_parameters.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GraphHiveImporterJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/input_graph_hive_parameters.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - String hiveDbName = parser.get("hiveDbName"); - log.info("hiveDbName: {}", hiveDbName); + String hiveDbName = parser.get("hiveDbName"); + log.info("hiveDbName: {}", hiveDbName); - String hiveMetastoreUris = parser.get("hiveMetastoreUris"); - log.info("hiveMetastoreUris: {}", hiveMetastoreUris); + String hiveMetastoreUris = parser.get("hiveMetastoreUris"); + log.info("hiveMetastoreUris: {}", 
hiveMetastoreUris); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", hiveMetastoreUris); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", hiveMetastoreUris); - runWithSparkHiveSession( - conf, isSparkSessionManaged, spark -> loadGraphAsHiveDB(spark, inputPath, hiveDbName)); - } + runWithSparkHiveSession( + conf, isSparkSessionManaged, spark -> loadGraphAsHiveDB(spark, inputPath, hiveDbName)); + } - // protected for testing - private static void loadGraphAsHiveDB(SparkSession spark, String inputPath, String hiveDbName) { + // protected for testing + private static void loadGraphAsHiveDB(SparkSession spark, String inputPath, String hiveDbName) { - spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName)); - spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); + spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName)); + spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - // Read the input file and convert it into RDD of serializable object - ModelSupport.oafTypes.forEach( - (name, clazz) -> - spark - .createDataset( - sc.textFile(inputPath + "/" + name) - .map(s -> OBJECT_MAPPER.readValue(s, clazz)) - .rdd(), - Encoders.bean(clazz)) - .write() - .mode(SaveMode.Overwrite) - .saveAsTable(hiveDbName + "." + name)); - } + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + // Read the input file and convert it into RDD of serializable object + ModelSupport.oafTypes + .forEach( + (name, clazz) -> spark + .createDataset( + sc + .textFile(inputPath + "/" + name) + .map(s -> OBJECT_MAPPER.readValue(s, clazz)) + .rdd(), + Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(hiveDbName + "." 
+ name)); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 69cd0001ff..e20d1eb796 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; @@ -10,6 +11,19 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.DocumentFactory; +import org.dom4j.DocumentHelper; +import org.dom4j.Node; + import eu.dnetlib.dhp.oa.graph.raw.common.MigrationConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Context; @@ -29,440 +43,429 @@ import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.dom4j.Document; -import org.dom4j.DocumentFactory; -import org.dom4j.DocumentHelper; -import org.dom4j.Node; public abstract class AbstractMdRecordToOafMapper { - protected final Map code2name; - - protected static 
final Qualifier MAIN_TITLE_QUALIFIER = - qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); - - protected AbstractMdRecordToOafMapper(final Map code2name) { - this.code2name = code2name; - } - - public List processMdRecord(final String xml) { - try { - final Map nsContext = new HashMap<>(); - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); - nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); - DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); - - final Document doc = - DocumentHelper.parseText( - xml.replaceAll( - "http://datacite.org/schema/kernel-4", "http://datacite.org/schema/kernel-3")); - - final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = - keyValue( - createOpenaireId(10, doc.valueOf("//oaf:collectedFrom/@id"), true), - doc.valueOf("//oaf:collectedFrom/@name")); - final KeyValue hostedBy = - StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? 
collectedFrom - : keyValue( - createOpenaireId(10, doc.valueOf("//oaf:hostedBy/@id"), true), - doc.valueOf("//oaf:hostedBy/@name")); - - final DataInfo info = prepareDataInfo(doc); - final long lastUpdateTimestamp = new Date().getTime(); - - return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - protected List createOafs( - final Document doc, - final String type, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { - - final List oafs = new ArrayList<>(); - - switch (type.toLowerCase()) { - case "": - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, 
info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); - break; - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; - } - - if (!oafs.isEmpty()) { - oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); - oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); - } - - return oafs; - } - - private List addProjectRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - - final List res = new ArrayList<>(); - - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - - for (final Object o : doc.selectNodes("//oaf:projectid")) { - - final String originalId = ((Node) o).getText(); - - if (StringUtils.isNotBlank(originalId)) { - final String projectId = createOpenaireId(40, originalId, true); - - final Relation r1 = new Relation(); - r1.setRelType("resultProject"); - r1.setSubRelType("outcome"); - r1.setRelClass("isProducedBy"); - r1.setSource(docId); - r1.setTarget(projectId); - r1.setCollectedfrom(Arrays.asList(collectedFrom)); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r1); - - final Relation r2 = new Relation(); - r2.setRelType("resultProject"); - r2.setSubRelType("outcome"); - r2.setRelClass("produces"); - r2.setSource(projectId); - r2.setTarget(docId); - r2.setCollectedfrom(Arrays.asList(collectedFrom)); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r2); - } - } - - return res; - } 
- - protected abstract List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp); - - private void populateResultFields( - final Result r, - final Document doc, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); - r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); - r.setCollectedfrom(Arrays.asList(collectedFrom)); - r.setPid( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); - r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); - r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); - r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setOaiprovenance(prepareOAIprovenance(doc)); - r.setAuthor(prepareAuthors(doc, info)); - r.setLanguage(prepareLanguages(doc)); - r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setSubject(prepareSubjects(doc, info)); - r.setTitle(prepareTitles(doc, info)); - r.setRelevantdate(prepareRelevantDates(doc, info)); - r.setDescription(prepareDescriptions(doc, info)); - r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); - r.setPublisher(preparePublisher(doc, info)); - r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); - r.setSource(prepareSources(doc, info)); - r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setFormat(prepareFormats(doc, info)); - r.setContributor(prepareContributors(doc, info)); - r.setResourcetype(prepareResourceType(doc, info)); - r.setCoverage(prepareCoverages(doc, info)); - r.setContext(prepareContexts(doc, info)); - r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setInstance(prepareInstances(doc, info, collectedFrom, 
hostedBy)); - } - - private List prepareContexts(final Document doc, final DataInfo info) { - final List list = new ArrayList<>(); - for (final Object o : doc.selectNodes("//oaf:concept")) { - final String cid = ((Node) o).valueOf("@id"); - if (StringUtils.isNotBlank(cid)) { - final Context c = new Context(); - c.setId(cid); - c.setDataInfo(Arrays.asList(info)); - list.add(c); - } - } - return list; - } - - protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); - - protected abstract List prepareInstances( - Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); - - protected abstract List> prepareSources(Document doc, DataInfo info); - - protected abstract List prepareRelevantDates(Document doc, DataInfo info); - - protected abstract List> prepareCoverages(Document doc, DataInfo info); - - protected abstract List> prepareContributors(Document doc, DataInfo info); - - protected abstract List> prepareFormats(Document doc, DataInfo info); - - protected abstract Field preparePublisher(Document doc, DataInfo info); - - protected abstract List> prepareDescriptions(Document doc, DataInfo info); - - protected abstract List prepareTitles(Document doc, DataInfo info); - - protected abstract List prepareSubjects(Document doc, DataInfo info); - - protected abstract Qualifier prepareLanguages(Document doc); - - protected abstract List prepareAuthors(Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductTools( - Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, DataInfo info); - - protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); - - protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); - - protected abstract List prepareSoftwareLicenses(Document doc, DataInfo 
info); - - protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, DataInfo info); - - protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); - - protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); - - protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); - - protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); - - protected abstract Field prepareDatasetSize(Document doc, DataInfo info); - - protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); - - protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - - private Journal prepareJournal(final Document doc, final DataInfo info) { - final Node n = doc.selectSingleNode("//oaf:journal"); - if (n != null) { - final String name = n.getText(); - final String issnPrinted = n.valueOf("@issn"); - final String issnOnline = n.valueOf("@eissn"); - final String issnLinking = n.valueOf("@lissn"); - final String ep = n.valueOf("@ep"); - final String iss = n.valueOf("@iss"); - final String sp = n.valueOf("@sp"); - final String vol = n.valueOf("@vol"); - final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { - return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - null, - null, - info); - } - } - return null; - } - - protected Qualifier prepareQualifier( - final Node node, final String xpath, final String schemeId, final String schemeName) { - final String classId = node.valueOf(xpath); - final String className = code2name.get(classId); - return qualifier(classId, className, schemeId, schemeName); - } - - protected List prepareListStructProps( - final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final String schemeName, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : 
node.selectNodes(xpath)) { - final Node n = (Node) o; - final String classId = n.valueOf(xpathClassId); - final String className = code2name.get(classId); - res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); - } - return res; - } - - protected List prepareListStructProps( - final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res.add(structuredProperty(n.getText(), qualifier, info)); - } - return res; - } - - protected List prepareListStructProps( - final Node node, final String xpath, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res.add( - structuredProperty( - n.getText(), - n.valueOf("@classid"), - n.valueOf("@classname"), - n.valueOf("@schemeid"), - n.valueOf("@schemename"), - info)); - } - return res; - } - - protected OAIProvenance prepareOAIprovenance(final Document doc) { - final Node n = - doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - - if (n == null) { - return null; - } - - final String identifier = n.valueOf("./*[local-name()='identifier']"); - final String baseURL = n.valueOf("./*[local-name()='baseURL']"); - ; - final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']"); - ; - final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); - final String datestamp = n.valueOf("./*[local-name()='datestamp']"); - ; - final String harvestDate = n.valueOf("@harvestDate"); - ; - - return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); - } - - protected DataInfo prepareDataInfo(final Document doc) { - final Node n = doc.selectSingleNode("//oaf:datainfo"); - - if (n == null) { - return dataInfo( - false, null, false, false, 
MigrationConstants.REPOSITORY_PROVENANCE_ACTIONS, "0.9"); - } - - final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); - final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); - final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); - final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); - - final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); - final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); - final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); - final String trust = n.valueOf("./oaf:trust"); - - return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - qualifier(paClassId, paClassName, paSchemeId, paSchemeName), - trust); - } - - protected Field prepareField(final Node node, final String xpath, final DataInfo info) { - return field(node.valueOf(xpath), info); - } - - protected List> prepareListFields( - final Node node, final String xpath, final DataInfo info) { - return listFields(info, prepareListString(node, xpath)); - } - - protected List prepareListString(final Node node, final String xpath) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final String s = ((Node) o).getText().trim(); - if (StringUtils.isNotBlank(s)) { - res.add(s); - } - } - return res; - } + protected final Map code2name; + + protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( + "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + + protected AbstractMdRecordToOafMapper(final Map code2name) { + this.code2name = code2name; + } + + public List processMdRecord(final String xml) { + try { + final Map nsContext = new HashMap<>(); + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", 
"http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + + final Document doc = DocumentHelper + .parseText( + xml + .replaceAll( + "http://datacite.org/schema/kernel-4", "http://datacite.org/schema/kernel-3")); + + final String type = doc.valueOf("//dr:CobjCategory/@type"); + final KeyValue collectedFrom = keyValue( + createOpenaireId(10, doc.valueOf("//oaf:collectedFrom/@id"), true), + doc.valueOf("//oaf:collectedFrom/@name")); + final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) + ? collectedFrom + : keyValue( + createOpenaireId(10, doc.valueOf("//oaf:hostedBy/@id"), true), + doc.valueOf("//oaf:hostedBy/@name")); + + final DataInfo info = prepareDataInfo(doc); + final long lastUpdateTimestamp = new Date().getTime(); + + return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + protected List createOafs( + final Document doc, + final String type, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List oafs = new ArrayList<>(); + + switch (type.toLowerCase()) { + case "": + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + 
d.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; + } + + if (!oafs.isEmpty()) { + oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); + oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); + } + + return oafs; + } + + private List addProjectRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List res = new ArrayList<>(); + + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + + for (final Object 
o : doc.selectNodes("//oaf:projectid")) { + + final String originalId = ((Node) o).getText(); + + if (StringUtils.isNotBlank(originalId)) { + final String projectId = createOpenaireId(40, originalId, true); + + final Relation r1 = new Relation(); + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("isProducedBy"); + r1.setSource(docId); + r1.setTarget(projectId); + r1.setCollectedfrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); + + final Relation r2 = new Relation(); + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("produces"); + r2.setSource(projectId); + r2.setTarget(docId); + r2.setCollectedfrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + } + + return res; + } + + protected abstract List addOtherResultRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); + + private void populateResultFields( + final Result r, + final Document doc, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); + r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r + .setPid( + prepareListStructProps( + doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); + r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); + r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setOaiprovenance(prepareOAIprovenance(doc)); + r.setAuthor(prepareAuthors(doc, info)); + r.setLanguage(prepareLanguages(doc)); + r.setCountry(new 
ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setSubject(prepareSubjects(doc, info)); + r.setTitle(prepareTitles(doc, info)); + r.setRelevantdate(prepareRelevantDates(doc, info)); + r.setDescription(prepareDescriptions(doc, info)); + r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); + r.setPublisher(preparePublisher(doc, info)); + r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); + r.setSource(prepareSources(doc, info)); + r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setFormat(prepareFormats(doc, info)); + r.setContributor(prepareContributors(doc, info)); + r.setResourcetype(prepareResourceType(doc, info)); + r.setCoverage(prepareCoverages(doc, info)); + r.setContext(prepareContexts(doc, info)); + r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); + } + + private List prepareContexts(final Document doc, final DataInfo info) { + final List list = new ArrayList<>(); + for (final Object o : doc.selectNodes("//oaf:concept")) { + final String cid = ((Node) o).valueOf("@id"); + if (StringUtils.isNotBlank(cid)) { + final Context c = new Context(); + c.setId(cid); + c.setDataInfo(Arrays.asList(info)); + list.add(c); + } + } + return list; + } + + protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); + + protected abstract List prepareInstances( + Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + + protected abstract List> prepareSources(Document doc, DataInfo info); + + protected abstract List prepareRelevantDates(Document doc, DataInfo info); + + protected abstract List> prepareCoverages(Document doc, DataInfo info); + + protected abstract List> prepareContributors(Document doc, DataInfo info); + + protected abstract List> prepareFormats(Document doc, DataInfo info); + + protected abstract Field preparePublisher(Document doc, DataInfo info); + + protected abstract List> 
prepareDescriptions(Document doc, DataInfo info); + + protected abstract List prepareTitles(Document doc, DataInfo info); + + protected abstract List prepareSubjects(Document doc, DataInfo info); + + protected abstract Qualifier prepareLanguages(Document doc); + + protected abstract List prepareAuthors(Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductTools( + Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductContactGroups( + Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductContactPersons( + Document doc, DataInfo info); + + protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); + + protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); + + protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); + + protected abstract List> prepareSoftwareDocumentationUrls( + Document doc, DataInfo info); + + protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); + + protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); + + protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); + + protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); + + protected abstract Field prepareDatasetSize(Document doc, DataInfo info); + + protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); + + protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); + + private Journal prepareJournal(final Document doc, final DataInfo info) { + final Node n = doc.selectSingleNode("//oaf:journal"); + if (n != null) { + final String name = n.getText(); + final String issnPrinted = n.valueOf("@issn"); + final String issnOnline = n.valueOf("@eissn"); + final String issnLinking = n.valueOf("@lissn"); + final String ep = n.valueOf("@ep"); + final String iss = 
n.valueOf("@iss"); + final String sp = n.valueOf("@sp"); + final String vol = n.valueOf("@vol"); + final String edition = n.valueOf("@edition"); + if (StringUtils.isNotBlank(name)) { + return journal( + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + null, + null, + info); + } + } + return null; + } + + protected Qualifier prepareQualifier( + final Node node, final String xpath, final String schemeId, final String schemeName) { + final String classId = node.valueOf(xpath); + final String className = code2name.get(classId); + return qualifier(classId, className, schemeId, schemeName); + } + + protected List prepareListStructProps( + final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final String schemeName, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + final String classId = n.valueOf(xpathClassId); + final String className = code2name.get(classId); + res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); + } + return res; + } + + protected List prepareListStructProps( + final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), qualifier, info)); + } + return res; + } + + protected List prepareListStructProps( + final Node node, final String xpath, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res + .add( + structuredProperty( + n.getText(), + n.valueOf("@classid"), + n.valueOf("@classname"), + n.valueOf("@schemeid"), + n.valueOf("@schemename"), + info)); + } + return res; + } + + protected OAIProvenance prepareOAIprovenance(final Document doc) { + final Node n = 
doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + + if (n == null) { + return null; + } + + final String identifier = n.valueOf("./*[local-name()='identifier']"); + final String baseURL = n.valueOf("./*[local-name()='baseURL']"); + ; + final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']"); + ; + final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); + final String datestamp = n.valueOf("./*[local-name()='datestamp']"); + ; + final String harvestDate = n.valueOf("@harvestDate"); + ; + + return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); + } + + protected DataInfo prepareDataInfo(final Document doc) { + final Node n = doc.selectSingleNode("//oaf:datainfo"); + + if (n == null) { + return dataInfo( + false, null, false, false, MigrationConstants.REPOSITORY_PROVENANCE_ACTIONS, "0.9"); + } + + final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); + final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); + final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); + final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); + + final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); + final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); + final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); + final String trust = n.valueOf("./oaf:trust"); + + return dataInfo( + deletedbyinference, + inferenceprovenance, + inferred, + false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), + trust); + } + + protected Field prepareField(final Node node, final String xpath, final DataInfo info) { + return field(node.valueOf(xpath), info); + } + + protected List> prepareListFields( + final Node node, final String xpath, final DataInfo info) { + return listFields(info, prepareListString(node, 
xpath)); + } + + protected List prepareListString(final Node node, final String xpath) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final String s = ((Node) o).getText().trim(); + if (StringUtils.isNotBlank(s)) { + res.add(s); + } + } + return res; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java index 8029f84222..1aab78afe1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java @@ -1,12 +1,10 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -18,66 +16,72 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + public class DispatchEntitiesApplication { - private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesApplication.class); + private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesApplication.class); - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - 
MigrateMongoMdstoresApplication.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dispatch_entities_parameters.json"))); - parser.parseArgument(args); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateMongoMdstoresApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dispatch_entities_parameters.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String sourcePath = parser.get("sourcePath"); - final String targetPath = parser.get("graphRawPath"); + final String sourcePath = parser.get("sourcePath"); + final String targetPath = parser.get("graphRawPath"); - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, targetPath); - ModelSupport.oafTypes - .values() - .forEach(clazz -> processEntity(spark, clazz, sourcePath, targetPath)); - }); - } + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, targetPath); + ModelSupport.oafTypes + .values() + .forEach(clazz -> processEntity(spark, clazz, sourcePath, targetPath)); + }); + } - private static void processEntity( - final SparkSession spark, - final Class clazz, - final String sourcePath, - final String targetPath) { - final String type = clazz.getSimpleName().toLowerCase(); + private static void processEntity( + final SparkSession spark, + final Class clazz, + final String sourcePath, + final String targetPath) { + final String 
type = clazz.getSimpleName().toLowerCase(); - log.info("Processing entities ({}) in file: {}", type, sourcePath); + log.info("Processing entities ({}) in file: {}", type, sourcePath); - spark - .read() - .textFile(sourcePath) - .filter((FilterFunction) value -> isEntityType(value, type)) - .map( - (MapFunction) l -> StringUtils.substringAfter(l, "|"), - Encoders.STRING()) - .write() - .option("compression", "gzip") - .mode(SaveMode.Overwrite) - .text(targetPath + "/" + type); - } + spark + .read() + .textFile(sourcePath) + .filter((FilterFunction) value -> isEntityType(value, type)) + .map( + (MapFunction) l -> StringUtils.substringAfter(l, "|"), + Encoders.STRING()) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .text(targetPath + "/" + type); + } - private static boolean isEntityType(final String line, final String type) { - return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); - } + private static boolean isEntityType(final String line, final String type) { + return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 21288ad98c..ccc9f8a890 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -1,17 +1,13 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static 
eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.IOException; import java.sql.SQLException; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.FileSystem; @@ -24,172 +20,182 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; public class GenerateEntitiesApplication { - private static final Logger log = LoggerFactory.getLogger(GenerateEntitiesApplication.class); + private static final Logger log = LoggerFactory.getLogger(GenerateEntitiesApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateMongoMdstoresApplication.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateMongoMdstoresApplication.class + .getResourceAsStream( + 
"/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String sourcePaths = parser.get("sourcePaths"); - final String targetPath = parser.get("targetPath"); + final String sourcePaths = parser.get("sourcePaths"); + final String targetPath = parser.get("targetPath"); - final String dbUrl = parser.get("postgresUrl"); - final String dbUser = parser.get("postgresUser"); - final String dbPassword = parser.get("postgresPassword"); + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); - final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); + final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, targetPath); - generateEntities(spark, code2name, sourcePaths, targetPath); - }); - } + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, targetPath); + generateEntities(spark, code2name, sourcePaths, targetPath); + }); + } - private static void generateEntities( - final SparkSession spark, - final Map code2name, - final String sourcePaths, - final String targetPath) { + private static void generateEntities( + final SparkSession spark, + final Map code2name, + final String sourcePaths, + final String targetPath) { - JavaSparkContext sc = 
JavaSparkContext.fromSparkContext(spark.sparkContext()); - final List existingSourcePaths = - Arrays.stream(sourcePaths.split(",")) - .filter(p -> exists(sc, p)) - .collect(Collectors.toList()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final List existingSourcePaths = Arrays + .stream(sourcePaths.split(",")) + .filter(p -> exists(sc, p)) + .collect(Collectors.toList()); - log.info("Generate entities from files:"); - existingSourcePaths.forEach(log::info); + log.info("Generate entities from files:"); + existingSourcePaths.forEach(log::info); - JavaRDD inputRdd = sc.emptyRDD(); + JavaRDD inputRdd = sc.emptyRDD(); - for (final String sp : existingSourcePaths) { - inputRdd = - inputRdd.union( - sc.sequenceFile(sp, Text.class, Text.class) - .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> convertToListOaf(k._1(), k._2(), code2name)) - .flatMap(list -> list.iterator())); - } + for (final String sp : existingSourcePaths) { + inputRdd = inputRdd + .union( + sc + .sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .flatMap(list -> list.iterator())); + } - inputRdd - .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) - .reduceByKey((o1, o2) -> merge(o1, o2)) - .map(Tuple2::_2) - .map( - oaf -> - oaf.getClass().getSimpleName().toLowerCase() - + "|" - + OBJECT_MAPPER.writeValueAsString(oaf)) - .saveAsTextFile(targetPath, GzipCodec.class); - } + inputRdd + .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) + .reduceByKey((o1, o2) -> merge(o1, o2)) + .map(Tuple2::_2) + .map( + oaf -> oaf.getClass().getSimpleName().toLowerCase() + + "|" + + OBJECT_MAPPER.writeValueAsString(oaf)) + .saveAsTextFile(targetPath, GzipCodec.class); + } - private static Oaf merge(Oaf o1, Oaf o2) { - if (ModelSupport.isSubClass(o1, OafEntity.class)) { - ((OafEntity) o1).mergeFrom((OafEntity) 
o2); - } else if (ModelSupport.isSubClass(o1, Relation.class)) { - ((Relation) o1).mergeFrom((Relation) o2); - } else { - throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName()); - } - return o1; - } + private static Oaf merge(Oaf o1, Oaf o2) { + if (ModelSupport.isSubClass(o1, OafEntity.class)) { + ((OafEntity) o1).mergeFrom((OafEntity) o2); + } else if (ModelSupport.isSubClass(o1, Relation.class)) { + ((Relation) o1).mergeFrom((Relation) o2); + } else { + throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName()); + } + return o1; + } - private static List convertToListOaf( - final String id, final String s, final Map code2name) { - final String type = StringUtils.substringAfter(id, ":"); + private static List convertToListOaf( + final String id, final String s, final Map code2name) { + final String type = StringUtils.substringAfter(id, ":"); - switch (type.toLowerCase()) { - case "native_oaf": - return new OafToOafMapper(code2name).processMdRecord(s); - case "native_odf": - return new OdfToOafMapper(code2name).processMdRecord(s); - case "datasource": - return Arrays.asList(convertFromJson(s, Datasource.class)); - case "organization": - return Arrays.asList(convertFromJson(s, Organization.class)); - case "project": - return Arrays.asList(convertFromJson(s, Project.class)); - case "relation": - return Arrays.asList(convertFromJson(s, Relation.class)); - case "publication": - return Arrays.asList(convertFromJson(s, Publication.class)); - case "dataset": - return Arrays.asList(convertFromJson(s, Dataset.class)); - case "software": - return Arrays.asList(convertFromJson(s, Software.class)); - case "otherresearchproduct": - return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); - default: - throw new RuntimeException("type not managed: " + type.toLowerCase()); - } - } + switch (type.toLowerCase()) { + case "native_oaf": + return new OafToOafMapper(code2name).processMdRecord(s); + case "native_odf": + 
return new OdfToOafMapper(code2name).processMdRecord(s); + case "datasource": + return Arrays.asList(convertFromJson(s, Datasource.class)); + case "organization": + return Arrays.asList(convertFromJson(s, Organization.class)); + case "project": + return Arrays.asList(convertFromJson(s, Project.class)); + case "relation": + return Arrays.asList(convertFromJson(s, Relation.class)); + case "publication": + return Arrays.asList(convertFromJson(s, Publication.class)); + case "dataset": + return Arrays.asList(convertFromJson(s, Dataset.class)); + case "software": + return Arrays.asList(convertFromJson(s, Software.class)); + case "otherresearchproduct": + return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); + default: + throw new RuntimeException("type not managed: " + type.toLowerCase()); + } + } - private static Map loadClassNames( - final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + private static Map loadClassNames( + final String dbUrl, final String dbUser, final String dbPassword) throws IOException { - log.info("Loading vocabulary terms from db..."); + log.info("Loading vocabulary terms from db..."); - final Map map = new HashMap<>(); + final Map map = new HashMap<>(); - try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { - dbClient.processResults( - "select code, name from class", - rs -> { - try { - map.put(rs.getString("code"), rs.getString("name")); - } catch (final SQLException e) { - e.printStackTrace(); - } - }); - } + try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { + dbClient + .processResults( + "select code, name from class", + rs -> { + try { + map.put(rs.getString("code"), rs.getString("name")); + } catch (final SQLException e) { + e.printStackTrace(); + } + }); + } - log.info("Found " + map.size() + " terms."); + log.info("Found " + map.size() + " terms."); - return map; - } + return map; + } - private static Oaf convertFromJson(final String s, final Class clazz) 
{ - try { - return OBJECT_MAPPER.readValue(s, clazz); - } catch (final Exception e) { - log.error("Error parsing object of class: " + clazz); - log.error(s); - throw new RuntimeException(e); - } - } + private static Oaf convertFromJson(final String s, final Class clazz) { + try { + return OBJECT_MAPPER.readValue(s, clazz); + } catch (final Exception e) { + log.error("Error parsing object of class: " + clazz); + log.error(s); + throw new RuntimeException(e); + } + } - private static boolean exists(final JavaSparkContext context, final String pathToFile) { - try { - final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration()); - final Path path = new Path(pathToFile); - return hdfs.exists(path); - } catch (final IOException e) { - throw new RuntimeException(e); - } - } + private static boolean exists(final JavaSparkContext context, final String pathToFile) { + try { + final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration()); + final Path path = new Path(pathToFile); + return hdfs.exists(path); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java index 7667735cb9..9b99097ce3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java @@ -1,14 +1,11 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static 
eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Objects; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -20,113 +17,118 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; public class MergeClaimsApplication { - private static final Logger log = LoggerFactory.getLogger(MergeClaimsApplication.class); + private static final Logger log = LoggerFactory.getLogger(MergeClaimsApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateMongoMdstoresApplication.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); - parser.parseArgument(args); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateMongoMdstoresApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - 
Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String rawGraphPath = parser.get("rawGraphPath"); - log.info("rawGraphPath: {}", rawGraphPath); + final String rawGraphPath = parser.get("rawGraphPath"); + log.info("rawGraphPath: {}", rawGraphPath); - final String claimsGraphPath = parser.get("claimsGraphPath"); - log.info("claimsGraphPath: {}", claimsGraphPath); + final String claimsGraphPath = parser.get("claimsGraphPath"); + log.info("claimsGraphPath: {}", claimsGraphPath); - final String outputRawGaphPath = parser.get("outputRawGaphPath"); - log.info("outputRawGaphPath: {}", outputRawGaphPath); + final String outputRawGaphPath = parser.get("outputRawGaphPath"); + log.info("outputRawGaphPath: {}", outputRawGaphPath); - String graphTableClassName = parser.get("graphTableClassName"); - log.info("graphTableClassName: {}", graphTableClassName); + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); - Class clazz = (Class) Class.forName(graphTableClassName); + Class clazz = (Class) Class.forName(graphTableClassName); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - String type = clazz.getSimpleName().toLowerCase(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + String type = 
clazz.getSimpleName().toLowerCase(); - String rawPath = rawGraphPath + "/" + type; - String claimPath = claimsGraphPath + "/" + type; - String outPath = outputRawGaphPath + "/" + type; + String rawPath = rawGraphPath + "/" + type; + String claimPath = claimsGraphPath + "/" + type; + String outPath = outputRawGaphPath + "/" + type; - removeOutputDir(spark, outPath); - mergeByType(spark, rawPath, claimPath, outPath, clazz); - }); - } + removeOutputDir(spark, outPath); + mergeByType(spark, rawPath, claimPath, outPath, clazz); + }); + } - private static void mergeByType( - SparkSession spark, String rawPath, String claimPath, String outPath, Class clazz) { - Dataset> raw = - readFromPath(spark, rawPath, clazz) - .map( - (MapFunction>) - value -> new Tuple2<>(ModelSupport.idFn().apply(value), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + private static void mergeByType( + SparkSession spark, String rawPath, String claimPath, String outPath, Class clazz) { + Dataset> raw = readFromPath(spark, rawPath, clazz) + .map( + (MapFunction>) value -> new Tuple2<>(ModelSupport.idFn().apply(value), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - Dataset> claim = - jsc.broadcast(readFromPath(spark, claimPath, clazz)) - .getValue() - .map( - (MapFunction>) - value -> new Tuple2<>(ModelSupport.idFn().apply(value), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + Dataset> claim = jsc + .broadcast(readFromPath(spark, claimPath, clazz)) + .getValue() + .map( + (MapFunction>) value -> new Tuple2<>(ModelSupport.idFn().apply(value), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - raw.joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer") - .map( - (MapFunction, Tuple2>, T>) - value -> { - Optional> opRaw = 
Optional.ofNullable(value._1()); - Optional> opClaim = Optional.ofNullable(value._2()); + raw + .joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer") + .map( + (MapFunction, Tuple2>, T>) value -> { + Optional> opRaw = Optional.ofNullable(value._1()); + Optional> opClaim = Optional.ofNullable(value._2()); - return opRaw.isPresent() - ? opRaw.get()._2() - : opClaim.isPresent() ? opClaim.get()._2() : null; - }, - Encoders.bean(clazz)) - .filter(Objects::nonNull) - .map( - (MapFunction) value -> OBJECT_MAPPER.writeValueAsString(value), - Encoders.STRING()) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(outPath); - } + return opRaw.isPresent() + ? opRaw.get()._2() + : opClaim.isPresent() ? opClaim.get()._2() : null; + }, + Encoders.bean(clazz)) + .filter(Objects::nonNull) + .map( + (MapFunction) value -> OBJECT_MAPPER.writeValueAsString(value), + Encoders.STRING()) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(outPath); + } - private static Dataset readFromPath( - SparkSession spark, String path, Class clazz) { - return spark - .read() - .textFile(path) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), - Encoders.bean(clazz)) - .filter((FilterFunction) value -> Objects.nonNull(ModelSupport.idFn().apply(value))); - } + private static Dataset readFromPath( + SparkSession spark, String path, Class clazz) { + return spark + .read() + .textFile(path) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), + Encoders.bean(clazz)) + .filter((FilterFunction) value -> Objects.nonNull(ModelSupport.idFn().apply(value))); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index f048698c87..d8ed88544d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.asString; @@ -10,6 +11,23 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listKeyValues; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; +import java.io.Closeable; +import java.io.IOException; +import java.sql.Array; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.function.Consumer; +import java.util.function.Function; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; @@ -31,512 +49,499 @@ import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.io.Closeable; -import java.io.IOException; -import java.sql.Array; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.List; -import 
java.util.function.Consumer; -import java.util.function.Function; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; public class MigrateDbEntitiesApplication extends AbstractMigrationApplication - implements Closeable { - - private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); - - private final DbClient dbClient; - - private final long lastUpdateTimestamp; - - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateDbEntitiesApplication.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json"))); - - parser.parseArgument(args); - - final String dbUrl = parser.get("postgresUrl"); - final String dbUser = parser.get("postgresUser"); - final String dbPassword = parser.get("postgresPassword"); - - final String hdfsPath = parser.get("hdfsPath"); - - final boolean processClaims = - parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); - - try (final MigrateDbEntitiesApplication smdbe = - new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) { - if (processClaims) { - log.info("Processing claims..."); - smdbe.execute("queryClaims.sql", smdbe::processClaims); - } else { - log.info("Processing datasources..."); - smdbe.execute("queryDatasources.sql", smdbe::processDatasource); - - log.info("Processing projects..."); - smdbe.execute("queryProjects.sql", smdbe::processProject); - - log.info("Processing orgs..."); - smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); - - log.info("Processing relations ds <-> orgs ..."); - smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); - - log.info("Processing projects <-> orgs ..."); - smdbe.execute("queryProjectOrganization.sql", 
smdbe::processProjectOrganization); - } - log.info("All done."); - } - } - - protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST - super(); - this.dbClient = null; - this.lastUpdateTimestamp = new Date().getTime(); - } - - public MigrateDbEntitiesApplication( - final String hdfsPath, final String dbUrl, final String dbUser, final String dbPassword) - throws Exception { - super(hdfsPath); - this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); - this.lastUpdateTimestamp = new Date().getTime(); - } - - public void execute(final String sqlFile, final Function> producer) - throws Exception { - final String sql = - IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/sql/" + sqlFile)); - - final Consumer consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf)); - - dbClient.processResults(sql, consumer); - } - - public List processDatasource(final ResultSet rs) { - - try { - - final DataInfo info = prepareDataInfo(rs); - - final Datasource ds = new Datasource(); - - ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); - ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); - ds.setCollectedfrom( - listKeyValues( - createOpenaireId(10, rs.getString("collectedfromid"), true), - rs.getString("collectedfromname"))); - ds.setPid(new ArrayList<>()); - ds.setDateofcollection(asString(rs.getDate("dateofcollection"))); - ds.setDateoftransformation(null); // Value not returned by the SQL query - ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB - ds.setOaiprovenance(null); // Values not present in the DB - ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); - ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); - ds.setOfficialname(field(rs.getString("officialname"), info)); - ds.setEnglishname(field(rs.getString("englishname"), info)); - ds.setWebsiteurl(field(rs.getString("websiteurl"), info)); - 
ds.setLogourl(field(rs.getString("logourl"), info)); - ds.setContactemail(field(rs.getString("contactemail"), info)); - ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info)); - ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info)); - ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info)); - ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info)); - ds.setDescription(field(rs.getString("description"), info)); - ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); - ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info)); - ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info)); - ds.setOdpolicies(field(rs.getString("odpolicies"), info)); - ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); - ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); - ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); - ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info)); - ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info)); - ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info)); - ds.setDataprovider(field(rs.getBoolean("dataprovider"), info)); - ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info)); - ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info)); - ds.setDatauploadtype(field(rs.getString("datauploadtype"), info)); - ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info)); - ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info)); - ds.setVersioning(field(rs.getBoolean("versioning"), info)); - ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info)); - ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info)); - 
ds.setPidsystems(field(rs.getString("pidsystems"), info)); - ds.setCertificates(field(rs.getString("certificates"), info)); - ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array - ds.setJournal( - prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal - ds.setDataInfo(info); - ds.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(ds); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processProject(final ResultSet rs) { - try { - - final DataInfo info = prepareDataInfo(rs); - - final Project p = new Project(); - - p.setId(createOpenaireId(40, rs.getString("projectid"), true)); - p.setOriginalId(Arrays.asList(rs.getString("projectid"))); - p.setCollectedfrom( - listKeyValues( - createOpenaireId(10, rs.getString("collectedfromid"), true), - rs.getString("collectedfromname"))); - p.setPid(new ArrayList<>()); - p.setDateofcollection(asString(rs.getDate("dateofcollection"))); - p.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); - p.setExtraInfo(new ArrayList<>()); // Values not present in the DB - p.setOaiprovenance(null); // Values not present in the DB - p.setWebsiteurl(field(rs.getString("websiteurl"), info)); - p.setCode(field(rs.getString("code"), info)); - p.setAcronym(field(rs.getString("acronym"), info)); - p.setTitle(field(rs.getString("title"), info)); - p.setStartdate(field(asString(rs.getDate("startdate")), info)); - p.setEnddate(field(asString(rs.getDate("enddate")), info)); - p.setCallidentifier(field(rs.getString("callidentifier"), info)); - p.setKeywords(field(rs.getString("keywords"), info)); - p.setDuration(field(Integer.toString(rs.getInt("duration")), info)); - p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info)); - p.setOamandatepublications( - field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); - p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); - 
p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); - p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info)); - p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); - p.setOptional1(field(rs.getString("optional1"), info)); - p.setOptional2(field(rs.getString("optional2"), info)); - p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info)); - p.setContactfullname(field(rs.getString("contactfullname"), info)); - p.setContactfax(field(rs.getString("contactfax"), info)); - p.setContactphone(field(rs.getString("contactphone"), info)); - p.setContactemail(field(rs.getString("contactemail"), info)); - p.setSummary(field(rs.getString("summary"), info)); - p.setCurrency(field(rs.getString("currency"), info)); - p.setTotalcost(new Float(rs.getDouble("totalcost"))); - p.setFundedamount(new Float(rs.getDouble("fundedamount"))); - p.setDataInfo(info); - p.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(p); - - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processOrganization(final ResultSet rs) { - - try { - - final DataInfo info = prepareDataInfo(rs); - - final Organization o = new Organization(); - - o.setId(createOpenaireId(20, rs.getString("organizationid"), true)); - o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); - o.setCollectedfrom( - listKeyValues( - createOpenaireId(10, rs.getString("collectedfromid"), true), - rs.getString("collectedfromname"))); - o.setPid(new ArrayList<>()); - o.setDateofcollection(asString(rs.getDate("dateofcollection"))); - o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); - o.setExtraInfo(new ArrayList<>()); // Values not present in the DB - o.setOaiprovenance(null); // Values not present in the DB - o.setLegalshortname(field(rs.getString("legalshortname"), info)); - o.setLegalname(field(rs.getString("legalname"), info)); - o.setAlternativeNames(new ArrayList<>()); // Values not 
returned by the SQL query - o.setWebsiteurl(field(rs.getString("websiteurl"), info)); - o.setLogourl(field(rs.getString("logourl"), info)); - o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); - o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); - o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); - o.setEcresearchorganization( - field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); - o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info)); - o.setEcinternationalorganizationeurinterests( - field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info)); - o.setEcinternationalorganization( - field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); - o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); - o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); - o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); - o.setCountry(prepareQualifierSplitting(rs.getString("country"))); - o.setDataInfo(info); - o.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(o); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processDatasourceOrganization(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId(20, rs.getString("organization"), true); - final String dsId = createOpenaireId(10, rs.getString("datasource"), true); - final List collectedFrom = - listKeyValues( - createOpenaireId(10, rs.getString("collectedfromid"), true), - rs.getString("collectedfromname")); - - final Relation r1 = new Relation(); - r1.setRelType("datasourceOrganization"); - r1.setSubRelType("provision"); - r1.setRelClass("isProvidedBy"); - r1.setSource(dsId); - r1.setTarget(orgId); - r1.setCollectedfrom(collectedFrom); - 
r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - final Relation r2 = new Relation(); - r2.setRelType("datasourceOrganization"); - r2.setSubRelType("provision"); - r2.setRelClass("provides"); - r2.setSource(orgId); - r2.setTarget(dsId); - r2.setCollectedfrom(collectedFrom); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r1, r2); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processProjectOrganization(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId(20, rs.getString("resporganization"), true); - final String projectId = createOpenaireId(40, rs.getString("project"), true); - final List collectedFrom = - listKeyValues( - createOpenaireId(10, rs.getString("collectedfromid"), true), - rs.getString("collectedfromname")); - - final Relation r1 = new Relation(); - r1.setRelType("projectOrganization"); - r1.setSubRelType("participation"); - r1.setRelClass("hasParticipant"); - r1.setSource(projectId); - r1.setTarget(orgId); - r1.setCollectedfrom(collectedFrom); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - final Relation r2 = new Relation(); - r2.setRelType("projectOrganization"); - r2.setSubRelType("participation"); - r2.setRelClass("isParticipant"); - r2.setSource(orgId); - r2.setTarget(projectId); - r2.setCollectedfrom(collectedFrom); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r1, r2); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processClaims(final ResultSet rs) { - - final DataInfo info = - dataInfo( - false, - null, - false, - false, - qualifier( - "user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), - "0.9"); - - final List collectedFrom = - listKeyValues(createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE"); - - 
try { - - if (rs.getString("source_type").equals("context")) { - final Result r; - - if (rs.getString("target_type").equals("dataset")) { - r = new Dataset(); - r.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER); - } else if (rs.getString("target_type").equals("software")) { - r = new Software(); - r.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER); - } else if (rs.getString("target_type").equals("other")) { - r = new OtherResearchProduct(); - r.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER); - } else { - r = new Publication(); - r.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER); - } - r.setId(createOpenaireId(50, rs.getString("target_id"), false)); - r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setContext(prepareContext(rs.getString("source_id"), info)); - r.setDataInfo(info); - r.setCollectedfrom(collectedFrom); - - return Arrays.asList(r); - } else { - final String sourceId = - createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false); - final String targetId = - createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false); - - final Relation r1 = new Relation(); - final Relation r2 = new Relation(); - - if (rs.getString("source_type").equals("project")) { - r1.setCollectedfrom(collectedFrom); - r1.setRelType("resultProject"); - r1.setSubRelType("outcome"); - r1.setRelClass("produces"); - - r2.setCollectedfrom(collectedFrom); - r2.setRelType("resultProject"); - r2.setSubRelType("outcome"); - r2.setRelClass("isProducedBy"); - } else { - r1.setCollectedfrom(collectedFrom); - r1.setRelType("resultResult"); - r1.setSubRelType("relationship"); - r1.setRelClass("isRelatedTo"); - - r2.setCollectedfrom(collectedFrom); - r2.setRelType("resultResult"); - r2.setSubRelType("relationship"); - r2.setRelClass("isRelatedTo"); - } - - r1.setSource(sourceId); - r1.setTarget(targetId); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - 
r2.setSource(targetId); - r2.setTarget(sourceId); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r1, r2); - } - - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - private List prepareContext(final String id, final DataInfo dataInfo) { - final Context context = new Context(); - context.setId(id); - context.setDataInfo(Arrays.asList(dataInfo)); - return Arrays.asList(context); - } - - private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException { - final Boolean deletedbyinference = rs.getBoolean("deletedbyinference"); - final String inferenceprovenance = rs.getString("inferenceprovenance"); - final Boolean inferred = rs.getBoolean("inferred"); - final String trust = rs.getString("trust"); - return dataInfo( - deletedbyinference, - inferenceprovenance, - inferred, - false, - MigrationConstants.ENTITYREGISTRY_PROVENANCE_ACTION, - trust); - } - - private Qualifier prepareQualifierSplitting(final String s) { - if (StringUtils.isBlank(s)) { - return null; - } - final String[] arr = s.split("@@@"); - return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null; - } - - private List> prepareListFields(final Array array, final DataInfo info) { - try { - return array != null ? 
listFields(info, (String[]) array.getArray()) : new ArrayList<>(); - } catch (final SQLException e) { - throw new RuntimeException("Invalid SQL array", e); - } - } - - private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) { - if (StringUtils.isBlank(s)) { - return null; - } - final String[] parts = s.split("###"); - if (parts.length == 2) { - final String value = parts[0]; - final String[] arr = parts[1].split("@@@"); - if (arr.length == 4) { - return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); - } - } - return null; - } - - private List prepareListOfStructProps( - final Array array, final DataInfo dataInfo) throws SQLException { - final List res = new ArrayList<>(); - if (array != null) { - for (final String s : (String[]) array.getArray()) { - final StructuredProperty sp = prepareStructProp(s, dataInfo); - if (sp != null) { - res.add(sp); - } - } - } - - return res; - } - - private Journal prepareJournal(final String name, final String sj, final DataInfo info) { - if (StringUtils.isNotBlank(sj)) { - final String[] arr = sj.split("@@@"); - if (arr.length == 3) { - final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; - final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null; - ; - final String lissn = StringUtils.isNotBlank(arr[2]) ? 
arr[2] : null; - ; - if (issn != null || eissn != null || lissn != null) { - return journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); - } - } - } - return null; - } - - @Override - public void close() throws IOException { - super.close(); - dbClient.close(); - } + implements Closeable { + + private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); + + private final DbClient dbClient; + + private final long lastUpdateTimestamp; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateDbEntitiesApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json"))); + + parser.parseArgument(args); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + final String hdfsPath = parser.get("hdfsPath"); + + final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); + + try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, + dbPassword)) { + if (processClaims) { + log.info("Processing claims..."); + smdbe.execute("queryClaims.sql", smdbe::processClaims); + } else { + log.info("Processing datasources..."); + smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + + log.info("Processing projects..."); + smdbe.execute("queryProjects.sql", smdbe::processProject); + + log.info("Processing orgs..."); + smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + + log.info("Processing relations ds <-> orgs ..."); + smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + + log.info("Processing projects <-> orgs ..."); + smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); 
+ } + log.info("All done."); + } + } + + protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST + super(); + this.dbClient = null; + this.lastUpdateTimestamp = new Date().getTime(); + } + + public MigrateDbEntitiesApplication( + final String hdfsPath, final String dbUrl, final String dbUser, final String dbPassword) + throws Exception { + super(hdfsPath); + this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); + this.lastUpdateTimestamp = new Date().getTime(); + } + + public void execute(final String sqlFile, final Function> producer) + throws Exception { + final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/sql/" + sqlFile)); + + final Consumer consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf)); + + dbClient.processResults(sql, consumer); + } + + public List processDatasource(final ResultSet rs) { + + try { + + final DataInfo info = prepareDataInfo(rs); + + final Datasource ds = new Datasource(); + + ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); + ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); + ds + .setCollectedfrom( + listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname"))); + ds.setPid(new ArrayList<>()); + ds.setDateofcollection(asString(rs.getDate("dateofcollection"))); + ds.setDateoftransformation(null); // Value not returned by the SQL query + ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB + ds.setOaiprovenance(null); // Values not present in the DB + ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); + ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); + ds.setOfficialname(field(rs.getString("officialname"), info)); + ds.setEnglishname(field(rs.getString("englishname"), info)); + ds.setWebsiteurl(field(rs.getString("websiteurl"), info)); + ds.setLogourl(field(rs.getString("logourl"), info)); + 
ds.setContactemail(field(rs.getString("contactemail"), info)); + ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info)); + ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info)); + ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info)); + ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info)); + ds.setDescription(field(rs.getString("description"), info)); + ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); + ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info)); + ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info)); + ds.setOdpolicies(field(rs.getString("odpolicies"), info)); + ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); + ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); + ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); + ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info)); + ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info)); + ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info)); + ds.setDataprovider(field(rs.getBoolean("dataprovider"), info)); + ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info)); + ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info)); + ds.setDatauploadtype(field(rs.getString("datauploadtype"), info)); + ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info)); + ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info)); + ds.setVersioning(field(rs.getBoolean("versioning"), info)); + ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info)); + ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info)); + ds.setPidsystems(field(rs.getString("pidsystems"), info)); + 
ds.setCertificates(field(rs.getString("certificates"), info)); + ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array + ds + .setJournal( + prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal + ds.setDataInfo(info); + ds.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(ds); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processProject(final ResultSet rs) { + try { + + final DataInfo info = prepareDataInfo(rs); + + final Project p = new Project(); + + p.setId(createOpenaireId(40, rs.getString("projectid"), true)); + p.setOriginalId(Arrays.asList(rs.getString("projectid"))); + p + .setCollectedfrom( + listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname"))); + p.setPid(new ArrayList<>()); + p.setDateofcollection(asString(rs.getDate("dateofcollection"))); + p.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); + p.setExtraInfo(new ArrayList<>()); // Values not present in the DB + p.setOaiprovenance(null); // Values not present in the DB + p.setWebsiteurl(field(rs.getString("websiteurl"), info)); + p.setCode(field(rs.getString("code"), info)); + p.setAcronym(field(rs.getString("acronym"), info)); + p.setTitle(field(rs.getString("title"), info)); + p.setStartdate(field(asString(rs.getDate("startdate")), info)); + p.setEnddate(field(asString(rs.getDate("enddate")), info)); + p.setCallidentifier(field(rs.getString("callidentifier"), info)); + p.setKeywords(field(rs.getString("keywords"), info)); + p.setDuration(field(Integer.toString(rs.getInt("duration")), info)); + p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info)); + p + .setOamandatepublications( + field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); + p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); + 
p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); + p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info)); + p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); + p.setOptional1(field(rs.getString("optional1"), info)); + p.setOptional2(field(rs.getString("optional2"), info)); + p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info)); + p.setContactfullname(field(rs.getString("contactfullname"), info)); + p.setContactfax(field(rs.getString("contactfax"), info)); + p.setContactphone(field(rs.getString("contactphone"), info)); + p.setContactemail(field(rs.getString("contactemail"), info)); + p.setSummary(field(rs.getString("summary"), info)); + p.setCurrency(field(rs.getString("currency"), info)); + p.setTotalcost(new Float(rs.getDouble("totalcost"))); + p.setFundedamount(new Float(rs.getDouble("fundedamount"))); + p.setDataInfo(info); + p.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(p); + + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processOrganization(final ResultSet rs) { + + try { + + final DataInfo info = prepareDataInfo(rs); + + final Organization o = new Organization(); + + o.setId(createOpenaireId(20, rs.getString("organizationid"), true)); + o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); + o + .setCollectedfrom( + listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname"))); + o.setPid(new ArrayList<>()); + o.setDateofcollection(asString(rs.getDate("dateofcollection"))); + o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); + o.setExtraInfo(new ArrayList<>()); // Values not present in the DB + o.setOaiprovenance(null); // Values not present in the DB + o.setLegalshortname(field(rs.getString("legalshortname"), info)); + o.setLegalname(field(rs.getString("legalname"), info)); + o.setAlternativeNames(new ArrayList<>()); // Values not 
returned by the SQL query + o.setWebsiteurl(field(rs.getString("websiteurl"), info)); + o.setLogourl(field(rs.getString("logourl"), info)); + o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); + o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); + o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); + o + .setEcresearchorganization( + field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); + o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info)); + o + .setEcinternationalorganizationeurinterests( + field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info)); + o + .setEcinternationalorganization( + field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); + o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); + o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); + o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); + o.setCountry(prepareQualifierSplitting(rs.getString("country"))); + o.setDataInfo(info); + o.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(o); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processDatasourceOrganization(final ResultSet rs) { + try { + final DataInfo info = prepareDataInfo(rs); + final String orgId = createOpenaireId(20, rs.getString("organization"), true); + final String dsId = createOpenaireId(10, rs.getString("datasource"), true); + final List collectedFrom = listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname")); + + final Relation r1 = new Relation(); + r1.setRelType("datasourceOrganization"); + r1.setSubRelType("provision"); + r1.setRelClass("isProvidedBy"); + r1.setSource(dsId); + r1.setTarget(orgId); + r1.setCollectedfrom(collectedFrom); 
+ r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + final Relation r2 = new Relation(); + r2.setRelType("datasourceOrganization"); + r2.setSubRelType("provision"); + r2.setRelClass("provides"); + r2.setSource(orgId); + r2.setTarget(dsId); + r2.setCollectedfrom(collectedFrom); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processProjectOrganization(final ResultSet rs) { + try { + final DataInfo info = prepareDataInfo(rs); + final String orgId = createOpenaireId(20, rs.getString("resporganization"), true); + final String projectId = createOpenaireId(40, rs.getString("project"), true); + final List collectedFrom = listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname")); + + final Relation r1 = new Relation(); + r1.setRelType("projectOrganization"); + r1.setSubRelType("participation"); + r1.setRelClass("hasParticipant"); + r1.setSource(projectId); + r1.setTarget(orgId); + r1.setCollectedfrom(collectedFrom); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + final Relation r2 = new Relation(); + r2.setRelType("projectOrganization"); + r2.setSubRelType("participation"); + r2.setRelClass("isParticipant"); + r2.setSource(orgId); + r2.setTarget(projectId); + r2.setCollectedfrom(collectedFrom); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processClaims(final ResultSet rs) { + + final DataInfo info = dataInfo( + false, + null, + false, + false, + qualifier( + "user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), + "0.9"); + + final List collectedFrom = listKeyValues( + createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE"); + + try 
{ + + if (rs.getString("source_type").equals("context")) { + final Result r; + + if (rs.getString("target_type").equals("dataset")) { + r = new Dataset(); + r.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER); + } else if (rs.getString("target_type").equals("software")) { + r = new Software(); + r.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER); + } else if (rs.getString("target_type").equals("other")) { + r = new OtherResearchProduct(); + r.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER); + } else { + r = new Publication(); + r.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER); + } + r.setId(createOpenaireId(50, rs.getString("target_id"), false)); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setContext(prepareContext(rs.getString("source_id"), info)); + r.setDataInfo(info); + r.setCollectedfrom(collectedFrom); + + return Arrays.asList(r); + } else { + final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false); + final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false); + + final Relation r1 = new Relation(); + final Relation r2 = new Relation(); + + if (rs.getString("source_type").equals("project")) { + r1.setCollectedfrom(collectedFrom); + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("produces"); + + r2.setCollectedfrom(collectedFrom); + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("isProducedBy"); + } else { + r1.setCollectedfrom(collectedFrom); + r1.setRelType("resultResult"); + r1.setSubRelType("relationship"); + r1.setRelClass("isRelatedTo"); + + r2.setCollectedfrom(collectedFrom); + r2.setRelType("resultResult"); + r2.setSubRelType("relationship"); + r2.setRelClass("isRelatedTo"); + } + + r1.setSource(sourceId); + r1.setTarget(targetId); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + 
r2.setSource(targetId); + r2.setTarget(sourceId); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } + + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + private List prepareContext(final String id, final DataInfo dataInfo) { + final Context context = new Context(); + context.setId(id); + context.setDataInfo(Arrays.asList(dataInfo)); + return Arrays.asList(context); + } + + private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException { + final Boolean deletedbyinference = rs.getBoolean("deletedbyinference"); + final String inferenceprovenance = rs.getString("inferenceprovenance"); + final Boolean inferred = rs.getBoolean("inferred"); + final String trust = rs.getString("trust"); + return dataInfo( + deletedbyinference, + inferenceprovenance, + inferred, + false, + MigrationConstants.ENTITYREGISTRY_PROVENANCE_ACTION, + trust); + } + + private Qualifier prepareQualifierSplitting(final String s) { + if (StringUtils.isBlank(s)) { + return null; + } + final String[] arr = s.split("@@@"); + return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null; + } + + private List> prepareListFields(final Array array, final DataInfo info) { + try { + return array != null ? 
listFields(info, (String[]) array.getArray()) : new ArrayList<>(); + } catch (final SQLException e) { + throw new RuntimeException("Invalid SQL array", e); + } + } + + private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) { + if (StringUtils.isBlank(s)) { + return null; + } + final String[] parts = s.split("###"); + if (parts.length == 2) { + final String value = parts[0]; + final String[] arr = parts[1].split("@@@"); + if (arr.length == 4) { + return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); + } + } + return null; + } + + private List prepareListOfStructProps( + final Array array, final DataInfo dataInfo) throws SQLException { + final List res = new ArrayList<>(); + if (array != null) { + for (final String s : (String[]) array.getArray()) { + final StructuredProperty sp = prepareStructProp(s, dataInfo); + if (sp != null) { + res.add(sp); + } + } + } + + return res; + } + + private Journal prepareJournal(final String name, final String sj, final DataInfo info) { + if (StringUtils.isNotBlank(sj)) { + final String[] arr = sj.split("@@@"); + if (arr.length == 3) { + final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; + final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null; + ; + final String lissn = StringUtils.isNotBlank(arr[2]) ? 
arr[2] : null; + ; + if (issn != null || eissn != null || lissn != null) { + return journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); + } + } + } + return null; + } + + @Override + public void close() throws IOException { + super.close(); + dbClient.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java index 95f4477e8b..00c1dc4bb4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java @@ -1,70 +1,73 @@ + package eu.dnetlib.dhp.oa.graph.raw; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; -import eu.dnetlib.dhp.oa.graph.raw.common.MdstoreClient; import java.io.Closeable; import java.io.IOException; import java.util.Map; import java.util.Map.Entry; + import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; +import eu.dnetlib.dhp.oa.graph.raw.common.MdstoreClient; + public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication - implements Closeable { + implements Closeable { - private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); + private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); - private final MdstoreClient mdstoreClient; + private final MdstoreClient mdstoreClient; - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new 
ArgumentApplicationParser( - IOUtils.toString( - MigrateMongoMdstoresApplication.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json"))); - parser.parseArgument(args); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateMongoMdstoresApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json"))); + parser.parseArgument(args); - final String mongoBaseUrl = parser.get("mongoBaseUrl"); - final String mongoDb = parser.get("mongoDb"); + final String mongoBaseUrl = parser.get("mongoBaseUrl"); + final String mongoDb = parser.get("mongoDb"); - final String mdFormat = parser.get("mdFormat"); - final String mdLayout = parser.get("mdLayout"); - final String mdInterpretation = parser.get("mdInterpretation"); + final String mdFormat = parser.get("mdFormat"); + final String mdLayout = parser.get("mdLayout"); + final String mdInterpretation = parser.get("mdInterpretation"); - final String hdfsPath = parser.get("hdfsPath"); + final String hdfsPath = parser.get("hdfsPath"); - try (MigrateMongoMdstoresApplication app = - new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) { - app.execute(mdFormat, mdLayout, mdInterpretation); - } - } + try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, + mongoDb)) { + app.execute(mdFormat, mdLayout, mdInterpretation); + } + } - public MigrateMongoMdstoresApplication( - final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception { - super(hdfsPath); - this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); - } + public MigrateMongoMdstoresApplication( + final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception { + super(hdfsPath); + this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); + } - public 
void execute(final String format, final String layout, final String interpretation) { - final Map colls = - mdstoreClient.validCollections(format, layout, interpretation); - log.info("Found " + colls.size() + " mdstores"); + public void execute(final String format, final String layout, final String interpretation) { + final Map colls = mdstoreClient.validCollections(format, layout, interpretation); + log.info("Found " + colls.size() + " mdstores"); - for (final Entry entry : colls.entrySet()) { - log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")"); - final String currentColl = entry.getValue(); + for (final Entry entry : colls.entrySet()) { + log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")"); + final String currentColl = entry.getValue(); - for (final String xml : mdstoreClient.listRecords(currentColl)) { - emit(xml, "native_" + format); - } - } - } + for (final String xml : mdstoreClient.listRecords(currentColl)) { + emit(xml, "native_" + format); + } + } + } - @Override - public void close() throws IOException { - super.close(); - mdstoreClient.close(); - } + @Override + public void close() throws IOException { + super.close(); + mdstoreClient.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 2a40e18024..286656149f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -1,8 +1,18 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import 
java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.Node; + import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -14,254 +24,251 @@ import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.dom4j.Document; -import org.dom4j.Node; public class OafToOafMapper extends AbstractMdRecordToOafMapper { - public OafToOafMapper(final Map code2name) { - super(code2name); - } + public OafToOafMapper(final Map code2name) { + super(code2name); + } - @Override - protected List prepareAuthors(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - int pos = 1; - for (final Object o : doc.selectNodes("//dc:creator")) { - final Node n = (Node) o; - final Author author = new Author(); - author.setFullname(n.getText()); - author.setRank(pos++); - final PacePerson p = new PacePerson(n.getText(), false); - if (p.isAccurate()) { - author.setName(p.getNormalisedFirstName()); - author.setSurname(p.getNormalisedSurname()); - } - res.add(author); - } - return res; - } + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//dc:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.getText()); + author.setRank(pos++); + final PacePerson p = new PacePerson(n.getText(), false); + if (p.isAccurate()) { + author.setName(p.getNormalisedFirstName()); + author.setSurname(p.getNormalisedSurname()); + } + res.add(author); + } + return res; + } - @Override - protected Qualifier 
prepareLanguages(final Document doc) { - return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); - } + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); + } - @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//dc:subject", info); - } + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:subject", info); + } - @Override - protected List prepareTitles(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); - } + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); + } - @Override - protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:description", info); - } + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:description", info); + } - @Override - protected Field preparePublisher(final Document doc, final DataInfo info) { - return prepareField(doc, "//dc:publisher", info); - } + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//dc:publisher", info); + } - @Override - protected List> prepareFormats(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:format", info); - } + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:format", info); + } - @Override - protected List> prepareContributors(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:contributor", info); - } + @Override + 
protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:contributor", info); + } - @Override - protected List> prepareCoverages(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:coverage", info); - } + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:coverage", info); + } - @Override - protected List prepareInstances( - final Document doc, - final DataInfo info, - final KeyValue collectedfrom, - final KeyValue hostedby) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//dc:identifier")) { - final String url = ((Node) o).getText().trim(); - if (url.startsWith("http")) { - final Instance instance = new Instance(); - instance.setUrl(Arrays.asList(url)); - instance.setInstancetype( - prepareQualifier( - doc, - "//dr:CobjCategory", - "dnet:publication_resource", - "dnet:publication_resource")); - instance.setCollectedfrom(collectedfrom); - instance.setHostedby(hostedby); - instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); - instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); - instance.setAccessright( - prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); - instance.setLicense(field(doc.valueOf("//oaf:license"), info)); - instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); - instance.setProcessingchargeamount( - field(doc.valueOf("//oaf:processingchargeamount"), info)); - instance.setProcessingchargecurrency( - field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - res.add(instance); - } - } - return res; - } + @Override + protected List prepareInstances( + final Document doc, + final DataInfo info, + final KeyValue collectedfrom, + final KeyValue hostedby) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//dc:identifier")) { 
+ final String url = ((Node) o).getText().trim(); + if (url.startsWith("http")) { + final Instance instance = new Instance(); + instance.setUrl(Arrays.asList(url)); + instance + .setInstancetype( + prepareQualifier( + doc, + "//dr:CobjCategory", + "dnet:publication_resource", + "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance + .setAccessright( + prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); + instance + .setProcessingchargeamount( + field(doc.valueOf("//oaf:processingchargeamount"), info)); + instance + .setProcessingchargecurrency( + field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + res.add(instance); + } + } + return res; + } - @Override - protected List> prepareSources(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:source", info); - } + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:source", info); + } - @Override - protected List prepareRelevantDates(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - // SOFTWARES + // SOFTWARES - @Override - protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - 
protected Field prepareSoftwareCodeRepositoryUrl( - final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareSoftwareCodeRepositoryUrl( + final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected List prepareSoftwareLicenses( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List prepareSoftwareLicenses( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List> prepareSoftwareDocumentationUrls( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareSoftwareDocumentationUrls( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - // DATASETS - @Override - protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + // DATASETS + @Override + protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetMetadataVersionNumber( - final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetMetadataVersionNumber( + final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetLastMetadataUpdate( - final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetLastMetadataUpdate( + final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + 
@Override + protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - // OTHER PRODUCTS + // OTHER PRODUCTS - @Override - protected List> prepareOtherResearchProductTools( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareOtherResearchProductTools( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List> prepareOtherResearchProductContactGroups( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareOtherResearchProductContactGroups( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List> prepareOtherResearchProductContactPersons( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareOtherResearchProductContactPersons( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + 
} - @Override - protected List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + @Override + protected List addOtherResultRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - final List res = new ArrayList<>(); + final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { + for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { - final String originalId = ((Node) o).getText(); + final String originalId = ((Node) o).getText(); - if (StringUtils.isNotBlank(originalId)) { + if (StringUtils.isNotBlank(originalId)) { - final String otherId = createOpenaireId(50, originalId, false); + final String otherId = createOpenaireId(50, originalId, false); - final Relation r1 = new Relation(); - r1.setRelType("resultResult"); - r1.setSubRelType("publicationDataset"); - r1.setRelClass("isRelatedTo"); - r1.setSource(docId); - r1.setTarget(otherId); - r1.setCollectedfrom(Arrays.asList(collectedFrom)); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r1); + final Relation r1 = new Relation(); + r1.setRelType("resultResult"); + r1.setSubRelType("publicationDataset"); + r1.setRelClass("isRelatedTo"); + r1.setSource(docId); + r1.setTarget(otherId); + r1.setCollectedfrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); - final Relation r2 = new Relation(); - r2.setRelType("resultResult"); - r2.setSubRelType("publicationDataset"); - r2.setRelClass("isRelatedTo"); - r2.setSource(otherId); - r2.setTarget(docId); - r2.setCollectedfrom(Arrays.asList(collectedFrom)); - 
r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r2); - } - } - return res; - } + final Relation r2 = new Relation(); + r2.setRelType("resultResult"); + r2.setSubRelType("publicationDataset"); + r2.setRelClass("isRelatedTo"); + r2.setSource(otherId); + r2.setTarget(docId); + r2.setCollectedfrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + } + return res; + } - @Override - protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 64755a6eb7..93b0eb29ce 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -1,9 +1,19 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.Node; + import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; @@ -14,338 +24,337 @@ import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; import 
eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.dom4j.Document; -import org.dom4j.Node; public class OdfToOafMapper extends AbstractMdRecordToOafMapper { - public OdfToOafMapper(final Map code2name) { - super(code2name); - } + public OdfToOafMapper(final Map code2name) { + super(code2name); + } - @Override - protected List prepareTitles(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info); - } + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info); + } - @Override - protected List prepareAuthors(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - int pos = 1; - for (final Object o : doc.selectNodes("//datacite:creator")) { - final Node n = (Node) o; - final Author author = new Author(); - author.setFullname(n.valueOf("./datacite:creatorName")); - author.setName(n.valueOf("./datacite:givenName")); - author.setSurname(n.valueOf("./datacite:familyName")); - author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); - author.setPid(preparePids(doc, info)); - author.setRank(pos++); - res.add(author); - } - return res; - } + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//datacite:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.valueOf("./datacite:creatorName")); + author.setName(n.valueOf("./datacite:givenName")); + author.setSurname(n.valueOf("./datacite:familyName")); + author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); + author.setPid(preparePids(doc, info)); + 
author.setRank(pos++); + res.add(author); + } + return res; + } - private List preparePids(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { - res.add( - structuredProperty( - ((Node) o).getText(), - prepareQualifier( - (Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), - info)); - } - return res; - } + private List preparePids(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { + res + .add( + structuredProperty( + ((Node) o).getText(), + prepareQualifier( + (Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), + info)); + } + return res; + } - @Override - protected List prepareInstances( - final Document doc, - final DataInfo info, - final KeyValue collectedfrom, - final KeyValue hostedby) { + @Override + protected List prepareInstances( + final Document doc, + final DataInfo info, + final KeyValue collectedfrom, + final KeyValue hostedby) { - final Instance instance = new Instance(); - instance.setUrl(new ArrayList<>()); - instance.setInstancetype( - prepareQualifier( - doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); - instance.setCollectedfrom(collectedfrom); - instance.setHostedby(hostedby); - instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); - instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); - instance.setAccessright( - prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); - instance.setLicense(field(doc.valueOf("//oaf:license"), info)); - instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); - instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); - instance.setProcessingchargecurrency( - 
field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + final Instance instance = new Instance(); + instance.setUrl(new ArrayList<>()); + instance + .setInstancetype( + prepareQualifier( + doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance + .setAccessright( + prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); + instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); + instance + .setProcessingchargecurrency( + field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - for (final Object o : - doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { - instance.getUrl().add(((Node) o).getText().trim()); - } - for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='URL']")) { - instance.getUrl().add(((Node) o).getText().trim()); - } - for (final Object o : - doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='DOI']")) { - instance.getUrl().add("http://dx.doi.org/" + ((Node) o).getText().trim()); - } - for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='DOI']")) { - instance.getUrl().add("http://dx.doi.org/" + ((Node) o).getText().trim()); - } - return Arrays.asList(instance); - } + for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { + instance.getUrl().add(((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='URL']")) { + 
instance.getUrl().add(((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='DOI']")) { + instance.getUrl().add("http://dx.doi.org/" + ((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='DOI']")) { + instance.getUrl().add("http://dx.doi.org/" + ((Node) o).getText().trim()); + } + return Arrays.asList(instance); + } - @Override - protected List> prepareSources(final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List prepareRelevantDates(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//datacite:date")) { - final String dateType = ((Node) o).valueOf("@dateType"); - if (StringUtils.isBlank(dateType) - && !dateType.equalsIgnoreCase("Accepted") - && !dateType.equalsIgnoreCase("Issued") - && !dateType.equalsIgnoreCase("Updated") - && !dateType.equalsIgnoreCase("Available")) { - res.add( - structuredProperty( - ((Node) o).getText(), - "UNKNOWN", - "UNKNOWN", - "dnet:dataCite_date", - "dnet:dataCite_date", - info)); - } - } - return res; - } + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//datacite:date")) { + final String dateType = ((Node) o).valueOf("@dateType"); + if (StringUtils.isBlank(dateType) + && !dateType.equalsIgnoreCase("Accepted") + && !dateType.equalsIgnoreCase("Issued") + && !dateType.equalsIgnoreCase("Updated") + && !dateType.equalsIgnoreCase("Available")) { + res + .add( + structuredProperty( + ((Node) o).getText(), + "UNKNOWN", + "UNKNOWN", + "dnet:dataCite_date", + "dnet:dataCite_date", + info)); + } + } + 
return res; + } - @Override - protected List> prepareCoverages(final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List> prepareContributors(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributorName", info); - } + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:contributorName", info); + } - @Override - protected List> prepareFormats(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:format", info); - } + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:format", info); + } - @Override - protected Field preparePublisher(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:publisher", info); - } + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:publisher", info); + } - @Override - protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); - } + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); + } - @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//datacite:subject", info); - } + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//datacite:subject", info); + } - @Override - protected Qualifier 
prepareLanguages(final Document doc) { - return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages"); - } + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages"); + } - @Override - protected List> prepareOtherResearchProductTools( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List> prepareOtherResearchProductTools( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List> prepareOtherResearchProductContactGroups( - final Document doc, final DataInfo info) { - return prepareListFields( - doc, - "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", - info); - } + @Override + protected List> prepareOtherResearchProductContactGroups( + final Document doc, final DataInfo info) { + return prepareListFields( + doc, + "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", + info); + } - @Override - protected List> prepareOtherResearchProductContactPersons( - final Document doc, final DataInfo info) { - return prepareListFields( - doc, - "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", - info); - } + @Override + protected List> prepareOtherResearchProductContactPersons( + final Document doc, final DataInfo info) { + return prepareListFields( + doc, + "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", + info); + } - @Override - protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return prepareQualifier( - doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); - } + @Override + protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { + return 
prepareQualifier( + doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); + } - @Override - protected Field prepareSoftwareCodeRepositoryUrl( - final Document doc, final DataInfo info) { - return null; // Not present in ODF ??? - } + @Override + protected Field prepareSoftwareCodeRepositoryUrl( + final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? + } - @Override - protected List prepareSoftwareLicenses( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List prepareSoftwareLicenses( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List> prepareSoftwareDocumentationUrls( - final Document doc, final DataInfo info) { - return prepareListFields( - doc, - "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", - info); - } + @Override + protected List> prepareSoftwareDocumentationUrls( + final Document doc, final DataInfo info) { + return prepareListFields( + doc, + "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", + info); + } - // DATASETS + // DATASETS - @Override - protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); + @Override + protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//datacite:geoLocation")) { - final GeoLocation loc = new GeoLocation(); - loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox")); - loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace")); - loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint")); - res.add(loc); - } - return res; - } + for (final Object o : doc.selectNodes("//datacite:geoLocation")) { + final GeoLocation loc = new 
GeoLocation(); + loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox")); + loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace")); + loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint")); + res.add(loc); + } + return res; + } - @Override - protected Field prepareDatasetMetadataVersionNumber( - final Document doc, final DataInfo info) { - return null; // Not present in ODF ??? - } + @Override + protected Field prepareDatasetMetadataVersionNumber( + final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? + } - @Override - protected Field prepareDatasetLastMetadataUpdate( - final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:date[@dateType='Updated']", info); - } + @Override + protected Field prepareDatasetLastMetadataUpdate( + final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:date[@dateType='Updated']", info); + } - @Override - protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:version", info); - } + @Override + protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:version", info); + } - @Override - protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:size", info); - } + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:size", info); + } - @Override - protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - return null; // Not present in ODF ??? - } + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? 
+ } - @Override - protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:date[@dateType='Issued']", info); - } + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:date[@dateType='Issued']", info); + } - @Override - protected List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { + @Override + protected List addOtherResultRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - final List res = new ArrayList<>(); + final List res = new ArrayList<>(); - for (final Object o : - doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) { + for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) { - final String originalId = ((Node) o).getText(); + final String originalId = ((Node) o).getText(); - if (StringUtils.isNotBlank(originalId)) { - final String otherId = createOpenaireId(50, originalId, false); - final String type = ((Node) o).valueOf("@relationType"); + if (StringUtils.isNotBlank(originalId)) { + final String otherId = createOpenaireId(50, originalId, false); + final String type = ((Node) o).valueOf("@relationType"); - if (type.equals("IsSupplementTo")) { - res.add( - prepareOtherResultRel( - collectedFrom, - info, - lastUpdateTimestamp, - docId, - otherId, - "supplement", - "isSupplementTo")); - res.add( - prepareOtherResultRel( - collectedFrom, - info, - lastUpdateTimestamp, - otherId, - docId, - "supplement", - "isSupplementedBy")); - } else if (type.equals("IsPartOf")) { - res.add( - prepareOtherResultRel( - 
collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf")); - res.add( - prepareOtherResultRel( - collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts")); - } else { - } - } - } - return res; - } + if (type.equals("IsSupplementTo")) { + res + .add( + prepareOtherResultRel( + collectedFrom, + info, + lastUpdateTimestamp, + docId, + otherId, + "supplement", + "isSupplementTo")); + res + .add( + prepareOtherResultRel( + collectedFrom, + info, + lastUpdateTimestamp, + otherId, + docId, + "supplement", + "isSupplementedBy")); + } else if (type.equals("IsPartOf")) { + res + .add( + prepareOtherResultRel( + collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf")); + res + .add( + prepareOtherResultRel( + collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts")); + } else { + } + } + } + return res; + } - private Relation prepareOtherResultRel( - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp, - final String source, - final String target, - final String subRelType, - final String relClass) { - final Relation r = new Relation(); - r.setRelType("resultResult"); - r.setSubRelType(subRelType); - r.setRelClass(relClass); - r.setSource(source); - r.setTarget(target); - r.setCollectedfrom(Arrays.asList(collectedFrom)); - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - return r; - } + private Relation prepareOtherResultRel( + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp, + final String source, + final String target, + final String subRelType, + final String relClass) { + final Relation r = new Relation(); + r.setRelType("resultResult"); + r.setSubRelType(subRelType); + r.setRelClass(relClass); + r.setSource(source); + r.setTarget(target); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + return r; + } - @Override - 
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return prepareQualifier( - doc, - "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", - "dnet:dataCite_resource", - "dnet:dataCite_resource"); - } + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return prepareQualifier( + doc, + "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", + "dnet:dataCite_resource", + "dnet:dataCite_resource"); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java index c7756be0d5..f7579c0a08 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java @@ -1,9 +1,10 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; -import eu.dnetlib.dhp.schema.oaf.Oaf; import java.io.Closeable; import java.io.IOException; import java.util.concurrent.atomic.AtomicInteger; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -12,72 +13,74 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.codehaus.jackson.map.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.Oaf; + public class AbstractMigrationApplication implements Closeable { - private final AtomicInteger counter = new AtomicInteger(0); + private final AtomicInteger counter = new AtomicInteger(0); - private final Text key = new Text(); + private final Text key = new Text(); - private final Text value = new Text(); + private final Text value = new Text(); - private final SequenceFile.Writer writer; + private final SequenceFile.Writer writer; - private 
final ObjectMapper objectMapper = new ObjectMapper(); + private final ObjectMapper objectMapper = new ObjectMapper(); - private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class); + private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class); - protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST - this.writer = null; - } + protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST + this.writer = null; + } - public AbstractMigrationApplication(final String hdfsPath) throws Exception { + public AbstractMigrationApplication(final String hdfsPath) throws Exception { - log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath)); + log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath)); - this.writer = - SequenceFile.createWriter( - getConf(), - SequenceFile.Writer.file(new Path(hdfsPath)), - SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class)); - } + this.writer = SequenceFile + .createWriter( + getConf(), + SequenceFile.Writer.file(new Path(hdfsPath)), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class)); + } - private Configuration getConf() throws IOException { - final Configuration conf = new Configuration(); - /* - * conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", - * org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", - * org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser); - * System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf); - */ - return conf; - } + private Configuration getConf() throws IOException { + final Configuration conf = new Configuration(); + /* + * conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", + * org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", + * 
org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser); + * System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf); + */ + return conf; + } - protected void emit(final String s, final String type) { - try { - key.set(counter.getAndIncrement() + ":" + type); - value.set(s); - writer.append(key, value); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } + protected void emit(final String s, final String type) { + try { + key.set(counter.getAndIncrement() + ":" + type); + value.set(s); + writer.append(key, value); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } - protected void emitOaf(final Oaf oaf) { - try { - emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase()); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } + protected void emitOaf(final Oaf oaf) { + try { + emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase()); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } - public ObjectMapper getObjectMapper() { - return objectMapper; - } + public ObjectMapper getObjectMapper() { + return objectMapper; + } - @Override - public void close() throws IOException { - writer.hflush(); - writer.close(); - } + @Override + public void close() throws IOException { + writer.hflush(); + writer.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java index ca7c9fffb6..121df81316 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java @@ -1,61 +1,62 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; import java.io.Closeable; import 
java.io.IOException; import java.sql.*; import java.util.function.Consumer; + import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; public class DbClient implements Closeable { - private static final Log log = LogFactory.getLog(DbClient.class); + private static final Log log = LogFactory.getLog(DbClient.class); - private Connection connection; + private Connection connection; - public DbClient(final String address, final String login, final String password) { + public DbClient(final String address, final String login, final String password) { - try { - Class.forName("org.postgresql.Driver"); + try { + Class.forName("org.postgresql.Driver"); - this.connection = - StringUtils.isNoneBlank(login, password) - ? DriverManager.getConnection(address, login, password) - : DriverManager.getConnection(address); - this.connection.setAutoCommit(false); - } catch (final Exception e) { - log.error("Connection to postgresDB failed"); - throw new RuntimeException("Connection to postgresDB failed", e); - } - log.info("Opened database successfully"); - } + this.connection = StringUtils.isNoneBlank(login, password) + ? 
DriverManager.getConnection(address, login, password) + : DriverManager.getConnection(address); + this.connection.setAutoCommit(false); + } catch (final Exception e) { + log.error("Connection to postgresDB failed"); + throw new RuntimeException("Connection to postgresDB failed", e); + } + log.info("Opened database successfully"); + } - public void processResults(final String sql, final Consumer consumer) { + public void processResults(final String sql, final Consumer consumer) { - try (final Statement stmt = connection.createStatement()) { - stmt.setFetchSize(100); + try (final Statement stmt = connection.createStatement()) { + stmt.setFetchSize(100); - try (final ResultSet rs = stmt.executeQuery(sql)) { - while (rs.next()) { - consumer.accept(rs); - } - } catch (final SQLException e) { - log.error("Error executing sql query: " + sql, e); - throw new RuntimeException("Error executing sql query", e); - } - } catch (final SQLException e1) { - log.error("Error preparing sql statement", e1); - throw new RuntimeException("Error preparing sql statement", e1); - } - } + try (final ResultSet rs = stmt.executeQuery(sql)) { + while (rs.next()) { + consumer.accept(rs); + } + } catch (final SQLException e) { + log.error("Error executing sql query: " + sql, e); + throw new RuntimeException("Error executing sql query", e); + } + } catch (final SQLException e1) { + log.error("Error preparing sql statement", e1); + throw new RuntimeException("Error preparing sql statement", e1); + } + } - @Override - public void close() throws IOException { - try { - connection.close(); - } catch (final SQLException e) { - throw new RuntimeException(e); - } - } + @Override + public void close() throws IOException { + try { + connection.close(); + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java index 1602c97427..a2177935ad 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java @@ -1,100 +1,102 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; -import com.google.common.collect.Iterables; -import com.mongodb.MongoClient; -import com.mongodb.MongoClientURI; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.stream.StreamSupport; + import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.bson.Document; +import com.google.common.collect.Iterables; +import com.mongodb.MongoClient; +import com.mongodb.MongoClientURI; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; + public class MdstoreClient implements Closeable { - private final MongoClient client; - private final MongoDatabase db; + private final MongoClient client; + private final MongoDatabase db; - private static final String COLL_METADATA = "metadata"; - private static final String COLL_METADATA_MANAGER = "metadataManager"; + private static final String COLL_METADATA = "metadata"; + private static final String COLL_METADATA_MANAGER = "metadataManager"; - private static final Log log = LogFactory.getLog(MdstoreClient.class); + private static final Log log = LogFactory.getLog(MdstoreClient.class); - public MdstoreClient(final String baseUrl, final String dbName) { - this.client = new MongoClient(new MongoClientURI(baseUrl)); - this.db = getDb(client, dbName); - } + public MdstoreClient(final String baseUrl, final String dbName) { + this.client = new 
MongoClient(new MongoClientURI(baseUrl)); + this.db = getDb(client, dbName); + } - public Map validCollections( - final String mdFormat, final String mdLayout, final String mdInterpretation) { + public Map validCollections( + final String mdFormat, final String mdLayout, final String mdInterpretation) { - final Map transactions = new HashMap<>(); - for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) { - final String mdId = entry.getString("mdId"); - final String currentId = entry.getString("currentId"); - if (StringUtils.isNoneBlank(mdId, currentId)) { - transactions.put(mdId, currentId); - } - } + final Map transactions = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) { + final String mdId = entry.getString("mdId"); + final String currentId = entry.getString("currentId"); + if (StringUtils.isNoneBlank(mdId, currentId)) { + transactions.put(mdId, currentId); + } + } - final Map res = new HashMap<>(); - for (final Document entry : getColl(db, COLL_METADATA, true).find()) { - if (entry.getString("format").equals(mdFormat) - && entry.getString("layout").equals(mdLayout) - && entry.getString("interpretation").equals(mdInterpretation) - && transactions.containsKey(entry.getString("mdId"))) { - res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); - } - } + final Map res = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA, true).find()) { + if (entry.getString("format").equals(mdFormat) + && entry.getString("layout").equals(mdLayout) + && entry.getString("interpretation").equals(mdInterpretation) + && transactions.containsKey(entry.getString("mdId"))) { + res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); + } + } - return res; - } + return res; + } - private MongoDatabase getDb(final MongoClient client, final String dbName) { - if (!Iterables.contains(client.listDatabaseNames(), dbName)) { - final String err = - 
String.format("Database '%s' not found in %s", dbName, client.getAddress()); - log.warn(err); - throw new RuntimeException(err); - } - return client.getDatabase(dbName); - } + private MongoDatabase getDb(final MongoClient client, final String dbName) { + if (!Iterables.contains(client.listDatabaseNames(), dbName)) { + final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress()); + log.warn(err); + throw new RuntimeException(err); + } + return client.getDatabase(dbName); + } - private MongoCollection getColl( - final MongoDatabase db, final String collName, final boolean abortIfMissing) { - if (!Iterables.contains(db.listCollectionNames(), collName)) { - final String err = - String.format( - String.format("Missing collection '%s' in database '%s'", collName, db.getName())); - log.warn(err); - if (abortIfMissing) { - throw new RuntimeException(err); - } else { - return null; - } - } - return db.getCollection(collName); - } + private MongoCollection getColl( + final MongoDatabase db, final String collName, final boolean abortIfMissing) { + if (!Iterables.contains(db.listCollectionNames(), collName)) { + final String err = String + .format( + String.format("Missing collection '%s' in database '%s'", collName, db.getName())); + log.warn(err); + if (abortIfMissing) { + throw new RuntimeException(err); + } else { + return null; + } + } + return db.getCollection(collName); + } - public Iterable listRecords(final String collName) { - final MongoCollection coll = getColl(db, collName, false); - return coll == null - ? new ArrayList<>() - : () -> - StreamSupport.stream(coll.find().spliterator(), false) - .filter(e -> e.containsKey("body")) - .map(e -> e.getString("body")) - .iterator(); - } + public Iterable listRecords(final String collName) { + final MongoCollection coll = getColl(db, collName, false); + return coll == null + ? 
new ArrayList<>() + : () -> StreamSupport + .stream(coll.find().spliterator(), false) + .filter(e -> e.containsKey("body")) + .map(e -> e.getString("body")) + .iterator(); + } - @Override - public void close() throws IOException { - client.close(); - } + @Override + public void close() throws IOException { + client.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java index 4e0b2dbd37..15bff95650 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; @@ -6,26 +7,21 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier; public class MigrationConstants { - public static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = - qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies"); - public static final Qualifier DATASET_RESULTTYPE_QUALIFIER = - qualifier( - "dataset", "dataset", - "dnet:result_typologies", "dnet:result_typologies"); - public static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = - qualifier( - "software", "software", - "dnet:result_typologies", "dnet:result_typologies"); - public static final Qualifier OTHER_RESULTTYPE_QUALIFIER = - qualifier( - "other", "other", - "dnet:result_typologies", "dnet:result_typologies"); - public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS = - qualifier( - "sysimport:crosswalk:repository", "sysimport:crosswalk:repository", - "dnet:provenanceActions", "dnet:provenanceActions"); - public static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = - qualifier( - "sysimport:crosswalk:entityregistry", 
"sysimport:crosswalk:entityregistry", - "dnet:provenanceActions", "dnet:provenanceActions"); + public static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = qualifier( + "publication", "publication", "dnet:result_typologies", "dnet:result_typologies"); + public static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier( + "dataset", "dataset", + "dnet:result_typologies", "dnet:result_typologies"); + public static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier( + "software", "software", + "dnet:result_typologies", "dnet:result_typologies"); + public static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier( + "other", "other", + "dnet:result_typologies", "dnet:result_typologies"); + public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS = qualifier( + "sysimport:crosswalk:repository", "sysimport:crosswalk:repository", + "dnet:provenanceActions", "dnet:provenanceActions"); + public static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = qualifier( + "sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", + "dnet:provenanceActions", "dnet:provenanceActions"); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java index b9788a05cd..9beed28371 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java @@ -1,215 +1,220 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.utils.DHPUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.stream.Collectors; + import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.DHPUtils; + public 
class OafMapperUtils { - public static KeyValue keyValue(final String k, final String v) { - final KeyValue kv = new KeyValue(); - kv.setKey(k); - kv.setValue(v); - return kv; - } + public static KeyValue keyValue(final String k, final String v) { + final KeyValue kv = new KeyValue(); + kv.setKey(k); + kv.setValue(v); + return kv; + } - public static List listKeyValues(final String... s) { - if (s.length % 2 > 0) { - throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); - } + public static List listKeyValues(final String... s) { + if (s.length % 2 > 0) { + throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); + } - final List list = new ArrayList<>(); - for (int i = 0; i < s.length; i += 2) { - list.add(keyValue(s[i], s[i + 1])); - } - return list; - } + final List list = new ArrayList<>(); + for (int i = 0; i < s.length; i += 2) { + list.add(keyValue(s[i], s[i + 1])); + } + return list; + } - public static Field field(final T value, final DataInfo info) { - if (value == null || StringUtils.isBlank(value.toString())) { - return null; - } + public static Field field(final T value, final DataInfo info) { + if (value == null || StringUtils.isBlank(value.toString())) { + return null; + } - final Field field = new Field<>(); - field.setValue(value); - field.setDataInfo(info); - return field; - } + final Field field = new Field<>(); + field.setValue(value); + field.setDataInfo(info); + return field; + } - public static List> listFields(final DataInfo info, final String... values) { - return Arrays.stream(values) - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } + public static List> listFields(final DataInfo info, final String... 
values) { + return Arrays + .stream(values) + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } - public static List> listFields(final DataInfo info, final List values) { - return values.stream() - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } + public static List> listFields(final DataInfo info, final List values) { + return values + .stream() + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } - public static Qualifier qualifier( - final String classid, - final String classname, - final String schemeid, - final String schemename) { - final Qualifier q = new Qualifier(); - q.setClassid(classid); - q.setClassname(classname); - q.setSchemeid(schemeid); - q.setSchemename(schemename); - return q; - } + public static Qualifier qualifier( + final String classid, + final String classname, + final String schemeid, + final String schemename) { + final Qualifier q = new Qualifier(); + q.setClassid(classid); + q.setClassname(classname); + q.setSchemeid(schemeid); + q.setSchemename(schemename); + return q; + } - public static StructuredProperty structuredProperty( - final String value, - final String classid, - final String classname, - final String schemeid, - final String schemename, - final DataInfo dataInfo) { + public static StructuredProperty structuredProperty( + final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { - return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); - } + return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } - public static StructuredProperty structuredProperty( - final String value, final Qualifier qualifier, final DataInfo dataInfo) { - if (value == null) { - return null; - } - final StructuredProperty sp = new StructuredProperty(); - 
sp.setValue(value); - sp.setQualifier(qualifier); - sp.setDataInfo(dataInfo); - return sp; - } + public static StructuredProperty structuredProperty( + final String value, final Qualifier qualifier, final DataInfo dataInfo) { + if (value == null) { + return null; + } + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(value); + sp.setQualifier(qualifier); + sp.setDataInfo(dataInfo); + return sp; + } - public static ExtraInfo extraInfo( - final String name, - final String value, - final String typology, - final String provenance, - final String trust) { - final ExtraInfo info = new ExtraInfo(); - info.setName(name); - info.setValue(value); - info.setTypology(typology); - info.setProvenance(provenance); - info.setTrust(trust); - return info; - } + public static ExtraInfo extraInfo( + final String name, + final String value, + final String typology, + final String provenance, + final String trust) { + final ExtraInfo info = new ExtraInfo(); + info.setName(name); + info.setValue(value); + info.setTypology(typology); + info.setProvenance(provenance); + info.setTrust(trust); + return info; + } - public static OAIProvenance oaiIProvenance( - final String identifier, - final String baseURL, - final String metadataNamespace, - final Boolean altered, - final String datestamp, - final String harvestDate) { + public static OAIProvenance oaiIProvenance( + final String identifier, + final String baseURL, + final String metadataNamespace, + final Boolean altered, + final String datestamp, + final String harvestDate) { - final OriginDescription desc = new OriginDescription(); - desc.setIdentifier(identifier); - desc.setBaseURL(baseURL); - desc.setMetadataNamespace(metadataNamespace); - desc.setAltered(altered); - desc.setDatestamp(datestamp); - desc.setHarvestDate(harvestDate); + final OriginDescription desc = new OriginDescription(); + desc.setIdentifier(identifier); + desc.setBaseURL(baseURL); + desc.setMetadataNamespace(metadataNamespace); + 
desc.setAltered(altered); + desc.setDatestamp(datestamp); + desc.setHarvestDate(harvestDate); - final OAIProvenance p = new OAIProvenance(); - p.setOriginDescription(desc); + final OAIProvenance p = new OAIProvenance(); + p.setOriginDescription(desc); - return p; - } + return p; + } - public static Journal journal( - final String name, - final String issnPrinted, - final String issnOnline, - final String issnLinking, - final String ep, - final String iss, - final String sp, - final String vol, - final String edition, - final String conferenceplace, - final String conferencedate, - final DataInfo dataInfo) { + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final String ep, + final String iss, + final String sp, + final String vol, + final String edition, + final String conferenceplace, + final String conferencedate, + final DataInfo dataInfo) { - if (StringUtils.isNotBlank(name) - || StringUtils.isNotBlank(issnPrinted) - || StringUtils.isNotBlank(issnOnline) - || StringUtils.isNotBlank(issnLinking)) { - final Journal j = new Journal(); - j.setName(name); - j.setIssnPrinted(issnPrinted); - j.setIssnOnline(issnOnline); - j.setIssnLinking(issnLinking); - j.setEp(ep); - j.setIss(iss); - j.setSp(sp); - j.setVol(vol); - j.setEdition(edition); - j.setConferenceplace(conferenceplace); - j.setConferencedate(conferencedate); - j.setDataInfo(dataInfo); - return j; - } else { - return null; - } - } + if (StringUtils.isNotBlank(name) + || StringUtils.isNotBlank(issnPrinted) + || StringUtils.isNotBlank(issnOnline) + || StringUtils.isNotBlank(issnLinking)) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + 
j.setDataInfo(dataInfo); + return j; + } else { + return null; + } + } - public static DataInfo dataInfo( - final Boolean deletedbyinference, - final String inferenceprovenance, - final Boolean inferred, - final Boolean invisible, - final Qualifier provenanceaction, - final String trust) { - final DataInfo d = new DataInfo(); - d.setDeletedbyinference(deletedbyinference); - d.setInferenceprovenance(inferenceprovenance); - d.setInferred(inferred); - d.setInvisible(invisible); - d.setProvenanceaction(provenanceaction); - d.setTrust(trust); - return d; - } + public static DataInfo dataInfo( + final Boolean deletedbyinference, + final String inferenceprovenance, + final Boolean inferred, + final Boolean invisible, + final Qualifier provenanceaction, + final String trust) { + final DataInfo d = new DataInfo(); + d.setDeletedbyinference(deletedbyinference); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setInvisible(invisible); + d.setProvenanceaction(provenanceaction); + d.setTrust(trust); + return d; + } - public static String createOpenaireId( - final int prefix, final String originalId, final boolean to_md5) { - if (to_md5) { - final String nsPrefix = StringUtils.substringBefore(originalId, "::"); - final String rest = StringUtils.substringAfter(originalId, "::"); - return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); - } else { - return String.format("%s|%s", prefix, originalId); - } - } + public static String createOpenaireId( + final int prefix, final String originalId, final boolean to_md5) { + if (to_md5) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); + } else { + return String.format("%s|%s", prefix, originalId); + } + } - public static String createOpenaireId( - final String type, final String originalId, final boolean to_md5) { - switch 
(type) { - case "datasource": - return createOpenaireId(10, originalId, to_md5); - case "organization": - return createOpenaireId(20, originalId, to_md5); - case "person": - return createOpenaireId(30, originalId, to_md5); - case "project": - return createOpenaireId(40, originalId, to_md5); - default: - return createOpenaireId(50, originalId, to_md5); - } - } + public static String createOpenaireId( + final String type, final String originalId, final boolean to_md5) { + switch (type) { + case "datasource": + return createOpenaireId(10, originalId, to_md5); + case "organization": + return createOpenaireId(20, originalId, to_md5); + case "person": + return createOpenaireId(30, originalId, to_md5); + case "project": + return createOpenaireId(40, originalId, to_md5); + default: + return createOpenaireId(50, originalId, to_md5); + } + } - public static String asString(final Object o) { - return o == null ? "" : o.toString(); - } + public static String asString(final Object o) { + return o == null ? 
"" : o.toString(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java index 5317983b1f..8adcd565b7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java @@ -1,178 +1,183 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; +import java.nio.charset.Charset; +import java.text.Normalizer; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.text.WordUtils; + import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.hash.Hashing; -import java.nio.charset.Charset; -import java.text.Normalizer; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.text.WordUtils; public class PacePerson { - private static final String UTF8 = "UTF-8"; - private List name = Lists.newArrayList(); - private List surname = Lists.newArrayList(); - private List fullname = Lists.newArrayList(); - private final String original; + private static final String UTF8 = "UTF-8"; + private List name = Lists.newArrayList(); + private List surname = Lists.newArrayList(); + private List fullname = Lists.newArrayList(); + private final String original; - private static Set particles = null; + private static Set particles = null; - public static final String capitalize(final String s) { - return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); - } + public static final String capitalize(final String s) { + return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); + } - public static final 
String dotAbbreviations(final String s) { - return s.length() == 1 ? s + "." : s; - } + public static final String dotAbbreviations(final String s) { + return s.length() == 1 ? s + "." : s; + } - public static Set loadFromClasspath(final String classpath) { - final Set h = new HashSet<>(); - try { - for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { - h.add(s); - } - } catch (final Throwable e) { - return new HashSet<>(); - } - return h; - } + public static Set loadFromClasspath(final String classpath) { + final Set h = new HashSet<>(); + try { + for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { + h.add(s); + } + } catch (final Throwable e) { + return new HashSet<>(); + } + return h; + } - public PacePerson(String s, final boolean aggressive) { - original = s; - s = Normalizer.normalize(s, Normalizer.Form.NFD); - s = s.replaceAll("\\(.+\\)", ""); - s = s.replaceAll("\\[.+\\]", ""); - s = s.replaceAll("\\{.+\\}", ""); - s = s.replaceAll("\\s+-\\s+", "-"); - s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); - s = s.replaceAll("\\d", " "); - s = s.replaceAll("\\n", " "); - s = s.replaceAll("\\.", " "); - s = s.replaceAll("\\s+", " "); + public PacePerson(String s, final boolean aggressive) { + original = s; + s = Normalizer.normalize(s, Normalizer.Form.NFD); + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); + s = s.replaceAll("\\d", " "); + s = s.replaceAll("\\n", " "); + s = s.replaceAll("\\.", " "); + s = s.replaceAll("\\s+", " "); - if (aggressive) { - s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); - // s = s.replaceAll("[\\W&&[^,-]]", ""); - } + if (aggressive) { + s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); + // s = s.replaceAll("[\\W&&[^,-]]", ""); + } - if (s.contains(",")) { - final String[] arr = 
s.split(","); - if (arr.length == 1) { - fullname = splitTerms(arr[0]); - } else if (arr.length > 1) { - surname = splitTerms(arr[0]); - name = splitTerms(arr[1]); - fullname.addAll(surname); - fullname.addAll(name); - } - } else { - fullname = splitTerms(s); + if (s.contains(",")) { + final String[] arr = s.split(","); + if (arr.length == 1) { + fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + name = splitTerms(arr[1]); + fullname.addAll(surname); + fullname.addAll(name); + } + } else { + fullname = splitTerms(s); - int lastInitialPosition = fullname.size(); - boolean hasSurnameInUpperCase = false; + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; - for (int i = 0; i < fullname.size(); i++) { - final String term = fullname.get(i); - if (term.length() == 1) { - lastInitialPosition = i; - } else if (term.equals(term.toUpperCase())) { - hasSurnameInUpperCase = true; - } - } + for (int i = 0; i < fullname.size(); i++) { + final String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } - if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini - name = fullname.subList(0, lastInitialPosition + 1); - surname = fullname.subList(lastInitialPosition + 1, fullname.size()); - } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI - for (final String term : fullname) { - if (term.length() > 1 && term.equals(term.toUpperCase())) { - surname.add(term); - } else { - name.add(term); - } - } - } - } - } + if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. 
Artini + name = fullname.subList(0, lastInitialPosition + 1); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (final String term : fullname) { + if (term.length() > 1 && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + name.add(term); + } + } + } + } + } - private List splitTerms(final String s) { - if (particles == null) { - particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt"); - } + private List splitTerms(final String s) { + if (particles == null) { + particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt"); + } - final List list = Lists.newArrayList(); - for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { - if (!particles.contains(part.toLowerCase())) { - list.add(part); - } - } - return list; - } + final List list = Lists.newArrayList(); + for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (!particles.contains(part.toLowerCase())) { + list.add(part); + } + } + return list; + } - public List getName() { - return name; - } + public List getName() { + return name; + } - public String getNameString() { - return Joiner.on(" ").join(getName()); - } + public String getNameString() { + return Joiner.on(" ").join(getName()); + } - public List getSurname() { - return surname; - } + public List getSurname() { + return surname; + } - public List getFullname() { - return fullname; - } + public List getFullname() { + return fullname; + } - public String getOriginal() { - return original; - } + public String getOriginal() { + return original; + } - public String hash() { - return Hashing.murmur3_128() - .hashString(getNormalisedFullname(), Charset.forName(UTF8)) - .toString(); - } + public String hash() { + return Hashing + .murmur3_128() + .hashString(getNormalisedFullname(), Charset.forName(UTF8)) + .toString(); + } - public String getNormalisedFirstName() { - 
return Joiner.on(" ").join(getCapitalFirstnames()); - } + public String getNormalisedFirstName() { + return Joiner.on(" ").join(getCapitalFirstnames()); + } - public String getNormalisedSurname() { - return Joiner.on(" ").join(getCapitalSurname()); - } + public String getNormalisedSurname() { + return Joiner.on(" ").join(getCapitalSurname()); + } - public String getSurnameString() { - return Joiner.on(" ").join(getSurname()); - } + public String getSurnameString() { + return Joiner.on(" ").join(getSurname()); + } - public String getNormalisedFullname() { - return isAccurate() - ? getNormalisedSurname() + ", " + getNormalisedFirstName() - : Joiner.on(" ").join(fullname); - } + public String getNormalisedFullname() { + return isAccurate() + ? getNormalisedSurname() + ", " + getNormalisedFirstName() + : Joiner.on(" ").join(fullname); + } - public List getCapitalFirstnames() { - return Lists.newArrayList( - Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); - } + public List getCapitalFirstnames() { + return Lists + .newArrayList( + Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); + } - public List getCapitalSurname() { - return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); - } + public List getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); + } - public List getNameWithAbbreviations() { - return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); - } + public List getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); + } - public boolean isAccurate() { - return name != null && surname != null && !name.isEmpty() && !surname.isEmpty(); - } + public boolean isAccurate() { + return name != null && surname != null && !name.isEmpty() && !surname.isEmpty(); + } } diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java index 2787c61a94..bc40afbfd6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java @@ -1,12 +1,6 @@ + package eu.dnetlib.dhp.sx.graph; -import com.mongodb.DBObject; -import com.mongodb.MongoClient; -import com.mongodb.QueryBuilder; -import com.mongodb.client.FindIterable; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.io.IOException; import java.net.URI; import java.util.ArrayList; @@ -15,6 +9,7 @@ import java.util.Objects; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -25,127 +20,134 @@ import org.apache.hadoop.io.Text; import org.bson.Document; import org.bson.conversions.Bson; +import com.mongodb.DBObject; +import com.mongodb.MongoClient; +import com.mongodb.QueryBuilder; +import com.mongodb.client.FindIterable; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + /** - * This job is responsible to collect data from mongoDatabase and store in a sequence File on HDFS - * Mongo database contains information of each MDSTore in two collections: -metadata That contains - * info like: ID, format, layout, interpretation -metadataManager: that contains info : ID, - * mongoCollectionName from the metadata collection we filter the ids with Format, layout, and - * Interpretation from the metadataManager we get the current 
MONGO collection name which contains - * metadata XML see function getCurrentId - * - *

This Job will be called different times in base at the triple we want import, and generates - * for each triple a sequence file of XML + * This job is responsible to collect data from mongoDatabase and store in a sequence File on HDFS Mongo database + * contains information of each MDSTore in two collections: -metadata That contains info like: ID, format, layout, + * interpretation -metadataManager: that contains info : ID, mongoCollectionName from the metadata collection we filter + * the ids with Format, layout, and Interpretation from the metadataManager we get the current MONGO collection name + * which contains metadata XML see function getCurrentId + *

+ * This Job will be called different times in base at the triple we want import, and generates for each triple a + * sequence file of XML */ public class ImportDataFromMongo { - /** - * It requires in input some parameters described on a file - * eu/dnetlib/dhp/graph/sx/import_from_mongo_parameters.json - * - *

- the name node - the paht where store HDFS File - the mongo host - the mongo port - the - * metadata format to import - the metadata layout to import - the metadata interpretation to - * import - the mongo database Name - * - *

This params are encoded into args - * - * @param args - * @throws Exception - */ - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - ImportDataFromMongo.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json"))); - parser.parseArgument(args); - final int port = Integer.parseInt(parser.get("dbport")); - final String host = parser.get("dbhost"); + /** + * It requires in input some parameters described on a file + * eu/dnetlib/dhp/graph/sx/import_from_mongo_parameters.json + *

+ * - the name node - the paht where store HDFS File - the mongo host - the mongo port - the metadata format to + * import - the metadata layout to import - the metadata interpretation to import - the mongo database Name + *

+ * This params are encoded into args + * + * @param args + * @throws Exception + */ + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + ImportDataFromMongo.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json"))); + parser.parseArgument(args); + final int port = Integer.parseInt(parser.get("dbport")); + final String host = parser.get("dbhost"); - final String format = parser.get("format"); - final String layout = parser.get("layout"); - final String interpretation = parser.get("interpretation"); + final String format = parser.get("format"); + final String layout = parser.get("layout"); + final String interpretation = parser.get("interpretation"); - final String dbName = parser.get("dbName"); - final MongoClient client = new MongoClient(host, port); - MongoDatabase database = client.getDatabase(dbName); + final String dbName = parser.get("dbName"); + final MongoClient client = new MongoClient(host, port); + MongoDatabase database = client.getDatabase(dbName); - MongoCollection metadata = database.getCollection("metadata"); - MongoCollection metadataManager = database.getCollection("metadataManager"); - final DBObject query = - QueryBuilder.start("format") - .is(format) - .and("layout") - .is(layout) - .and("interpretation") - .is(interpretation) - .get(); - final List ids = new ArrayList<>(); - metadata - .find((Bson) query) - .forEach((Consumer) document -> ids.add(document.getString("mdId"))); - List databaseId = - ids.stream() - .map(it -> getCurrentId(it, metadataManager)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); + MongoCollection metadata = database.getCollection("metadata"); + MongoCollection metadataManager = database.getCollection("metadataManager"); + final DBObject query = QueryBuilder + .start("format") + .is(format) + .and("layout") + .is(layout) + .and("interpretation") + 
.is(interpretation) + .get(); + final List ids = new ArrayList<>(); + metadata + .find((Bson) query) + .forEach((Consumer) document -> ids.add(document.getString("mdId"))); + List databaseId = ids + .stream() + .map(it -> getCurrentId(it, metadataManager)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); - final String hdfsuri = parser.get("namenode"); - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + final String hdfsuri = parser.get("namenode"); + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(parser.get("targetPath")); + FileSystem.get(URI.create(hdfsuri), conf); + Path hdfswritepath = new Path(parser.get("targetPath")); - final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer writer = - SequenceFile.createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final IntWritable key = new IntWritable(counter.get()); - final Text value = new Text(); - databaseId.forEach( - id -> { - System.out.println("Reading :" + id); - MongoCollection collection = database.getCollection(id); - collection - .find() - .forEach( - (Consumer) - document -> { - key.set(counter.getAndIncrement()); - value.set(document.getString("body")); + final AtomicInteger counter = new AtomicInteger(0); + try 
(SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { + final IntWritable key = new IntWritable(counter.get()); + final Text value = new Text(); + databaseId + .forEach( + id -> { + System.out.println("Reading :" + id); + MongoCollection collection = database.getCollection(id); + collection + .find() + .forEach( + (Consumer) document -> { + key.set(counter.getAndIncrement()); + value.set(document.getString("body")); - if (counter.get() % 10000 == 0) { - System.out.println("Added " + counter.get()); - } - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - }); - } - } + if (counter.get() % 10000 == 0) { + System.out.println("Added " + counter.get()); + } + try { + writer.append(key, value); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + }); + } + } - /** - * Return the name of mongo collection giving an MdStore ID - * - * @param mdId The id of the MDStore - * @param metadataManager The collection metadataManager on mongo which contains this information - * @return - */ - private static String getCurrentId( - final String mdId, final MongoCollection metadataManager) { - FindIterable result = - metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get()); - final Document item = result.first(); - return item == null ? 
null : item.getString("currentId"); - } + /** + * Return the name of mongo collection giving an MdStore ID + * + * @param mdId The id of the MDStore + * @param metadataManager The collection metadataManager on mongo which contains this information + * @return + */ + private static String getCurrentId( + final String mdId, final MongoCollection metadataManager) { + FindIterable result = metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get()); + final Document item = result.first(); + return item == null ? null : item.getString("currentId"); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java index 457f987ebf..4f015a9ad8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java @@ -1,11 +1,10 @@ + package eu.dnetlib.dhp.sx.graph; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import net.minidev.json.JSONArray; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.compress.GzipCodec; @@ -13,107 +12,115 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; +import com.jayway.jsonpath.JsonPath; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import net.minidev.json.JSONArray; + /** - * This Job extracts a typology of entity and stores it in a new RDD This job is called different - * times, for each file generated by the Job {@link ImportDataFromMongo} and store the new RDD in a - * path that should be under a folder: 
extractedEntities/entity/version1 - * - *

at the end of this process we will have : extractedEntities/dataset/version1 - * extractedEntities/dataset/version2 extractedEntities/dataset/... - * extractedEntities/publication/version1 extractedEntities/publication/version2 - * extractedEntities/publication/... extractedEntities/unknown/version1 - * extractedEntities/unknown/version2 extractedEntities/unknown/... - * extractedEntities/relation/version1 extractedEntities/relation/version2 + * This Job extracts a typology of entity and stores it in a new RDD This job is called different times, for each file + * generated by the Job {@link ImportDataFromMongo} and store the new RDD in a path that should be under a folder: + * extractedEntities/entity/version1 + *

+ * at the end of this process we will have : extractedEntities/dataset/version1 extractedEntities/dataset/version2 + * extractedEntities/dataset/... extractedEntities/publication/version1 extractedEntities/publication/version2 + * extractedEntities/publication/... extractedEntities/unknown/version1 extractedEntities/unknown/version2 + * extractedEntities/unknown/... extractedEntities/relation/version1 extractedEntities/relation/version2 * extractedEntities/relation/... */ public class SparkExtractEntitiesJob { - static final String IDJSONPATH = "$.id"; - static final String SOURCEJSONPATH = "$.source"; - static final String TARGETJSONPATH = "$.target"; + static final String IDJSONPATH = "$.id"; + static final String SOURCEJSONPATH = "$.source"; + static final String TARGETJSONPATH = "$.target"; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkExtractEntitiesJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkExtractEntitiesJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String targetPath = parser.get("targetPath"); - final String tdir = parser.get("targetDir"); - final JavaRDD inputRDD = sc.textFile(inputPath); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkExtractEntitiesJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + 
.appName(SparkExtractEntitiesJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String targetPath = parser.get("targetPath"); + final String tdir = parser.get("targetDir"); + final JavaRDD inputRDD = sc.textFile(inputPath); - List entities = - Arrays.stream(parser.get("entities").split(",")) - .map(String::trim) - .collect(Collectors.toList()); - if (entities.stream().anyMatch("dataset"::equalsIgnoreCase)) { - // Extract Dataset - inputRDD - .filter(SparkExtractEntitiesJob::isDataset) - .saveAsTextFile(targetPath + "/dataset/" + tdir, GzipCodec.class); - } - if (entities.stream().anyMatch("unknown"::equalsIgnoreCase)) { - // Extract Unknown - inputRDD - .filter(SparkExtractEntitiesJob::isUnknown) - .saveAsTextFile(targetPath + "/unknown/" + tdir, GzipCodec.class); - } + List entities = Arrays + .stream(parser.get("entities").split(",")) + .map(String::trim) + .collect(Collectors.toList()); + if (entities.stream().anyMatch("dataset"::equalsIgnoreCase)) { + // Extract Dataset + inputRDD + .filter(SparkExtractEntitiesJob::isDataset) + .saveAsTextFile(targetPath + "/dataset/" + tdir, GzipCodec.class); + } + if (entities.stream().anyMatch("unknown"::equalsIgnoreCase)) { + // Extract Unknown + inputRDD + .filter(SparkExtractEntitiesJob::isUnknown) + .saveAsTextFile(targetPath + "/unknown/" + tdir, GzipCodec.class); + } - if (entities.stream().anyMatch("relation"::equalsIgnoreCase)) { - // Extract Relation - inputRDD - .filter(SparkExtractEntitiesJob::isRelation) - .saveAsTextFile(targetPath + "/relation/" + tdir, GzipCodec.class); - } - if (entities.stream().anyMatch("publication"::equalsIgnoreCase)) { - // Extract Relation - inputRDD - .filter(SparkExtractEntitiesJob::isPublication) - .saveAsTextFile(targetPath + "/publication/" + tdir, GzipCodec.class); - } - } + if 
(entities.stream().anyMatch("relation"::equalsIgnoreCase)) { + // Extract Relation + inputRDD + .filter(SparkExtractEntitiesJob::isRelation) + .saveAsTextFile(targetPath + "/relation/" + tdir, GzipCodec.class); + } + if (entities.stream().anyMatch("publication"::equalsIgnoreCase)) { + // Extract Relation + inputRDD + .filter(SparkExtractEntitiesJob::isPublication) + .saveAsTextFile(targetPath + "/publication/" + tdir, GzipCodec.class); + } + } - public static boolean isDataset(final String json) { - final String id = getJPathString(IDJSONPATH, json); - if (StringUtils.isBlank(id)) return false; - return id.startsWith("60|"); - } + public static boolean isDataset(final String json) { + final String id = getJPathString(IDJSONPATH, json); + if (StringUtils.isBlank(id)) + return false; + return id.startsWith("60|"); + } - public static boolean isPublication(final String json) { - final String id = getJPathString(IDJSONPATH, json); - if (StringUtils.isBlank(id)) return false; - return id.startsWith("50|"); - } + public static boolean isPublication(final String json) { + final String id = getJPathString(IDJSONPATH, json); + if (StringUtils.isBlank(id)) + return false; + return id.startsWith("50|"); + } - public static boolean isUnknown(final String json) { - final String id = getJPathString(IDJSONPATH, json); - if (StringUtils.isBlank(id)) return false; - return id.startsWith("70|"); - } + public static boolean isUnknown(final String json) { + final String id = getJPathString(IDJSONPATH, json); + if (StringUtils.isBlank(id)) + return false; + return id.startsWith("70|"); + } - public static boolean isRelation(final String json) { - final String source = getJPathString(SOURCEJSONPATH, json); - final String target = getJPathString(TARGETJSONPATH, json); - return StringUtils.isNotBlank(source) && StringUtils.isNotBlank(target); - } + public static boolean isRelation(final String json) { + final String source = getJPathString(SOURCEJSONPATH, json); + final String target = 
getJPathString(TARGETJSONPATH, json); + return StringUtils.isNotBlank(source) && StringUtils.isNotBlank(target); + } - public static String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) return (String) o; - if (o instanceof JSONArray && ((JSONArray) o).size() > 0) - return (String) ((JSONArray) o).get(0); - return ""; - } catch (Exception e) { - return ""; - } - } + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java index cd8375df54..f3d7fd40ff 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java @@ -1,7 +1,6 @@ + package eu.dnetlib.dhp.sx.graph; -import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; -import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; @@ -10,70 +9,67 @@ import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; + +import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; /** - * In some case the identifier generated for the Entity in @{@link SparkExtractEntitiesJob} is - * different from the 
identifier * associated by the aggregator, this means that some relation - * points to missing identifier To avoid this problem we store in the model the Id and the - * OriginalObJIdentifier This jobs extract this pair and creates a Similar relation that will be - * used in SparkMergeEntities + * In some case the identifier generated for the Entity in @{@link SparkExtractEntitiesJob} is different from the + * identifier * associated by the aggregator, this means that some relation points to missing identifier To avoid this + * problem we store in the model the Id and the OriginalObJIdentifier This jobs extract this pair and creates a Similar + * relation that will be used in SparkMergeEntities */ public class SparkSXGeneratePidSimlarity { - static final String IDJSONPATH = "$.id"; - static final String OBJIDPATH = "$.originalObjIdentifier"; + static final String IDJSONPATH = "$.id"; + static final String OBJIDPATH = "$.originalObjIdentifier"; - public static void generateDataFrame( - final SparkSession spark, - final JavaSparkContext sc, - final String inputPath, - final String targetPath) { + public static void generateDataFrame( + final SparkSession spark, + final JavaSparkContext sc, + final String inputPath, + final String targetPath) { - final JavaPairRDD datasetSimRel = - sc.textFile(inputPath + "/dataset/*") - .mapToPair( - (PairFunction) - k -> - new Tuple2<>( - DHPUtils.getJPathString(IDJSONPATH, k), - DHPUtils.getJPathString(OBJIDPATH, k))) - .filter( - t -> - !StringUtils.substringAfter(t._1(), "|") - .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) - .distinct(); + final JavaPairRDD datasetSimRel = sc + .textFile(inputPath + "/dataset/*") + .mapToPair( + (PairFunction) k -> new Tuple2<>( + DHPUtils.getJPathString(IDJSONPATH, k), + DHPUtils.getJPathString(OBJIDPATH, k))) + .filter( + t -> !StringUtils + .substringAfter(t._1(), "|") + .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) + .distinct(); - final JavaPairRDD 
publicationSimRel = - sc.textFile(inputPath + "/publication/*") - .mapToPair( - (PairFunction) - k -> - new Tuple2<>( - DHPUtils.getJPathString(IDJSONPATH, k), - DHPUtils.getJPathString(OBJIDPATH, k))) - .filter( - t -> - !StringUtils.substringAfter(t._1(), "|") - .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) - .distinct(); + final JavaPairRDD publicationSimRel = sc + .textFile(inputPath + "/publication/*") + .mapToPair( + (PairFunction) k -> new Tuple2<>( + DHPUtils.getJPathString(IDJSONPATH, k), + DHPUtils.getJPathString(OBJIDPATH, k))) + .filter( + t -> !StringUtils + .substringAfter(t._1(), "|") + .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) + .distinct(); - JavaRDD simRel = - datasetSimRel - .union(publicationSimRel) - .map( - s -> { - final DLIRelation r = new DLIRelation(); - r.setSource(s._1()); - r.setTarget(s._2()); - r.setRelType("similar"); - return r; - }); - spark - .createDataset(simRel.rdd(), Encoders.bean(DLIRelation.class)) - .distinct() - .write() - .mode(SaveMode.Overwrite) - .save(targetPath + "/pid_simRel"); - } + JavaRDD simRel = datasetSimRel + .union(publicationSimRel) + .map( + s -> { + final DLIRelation r = new DLIRelation(); + r.setSource(s._1()); + r.setTarget(s._2()); + r.setRelType("similar"); + return r; + }); + spark + .createDataset(simRel.rdd(), Encoders.bean(DLIRelation.class)) + .distinct() + .write() + .mode(SaveMode.Overwrite) + .save(targetPath + "/pid_simRel"); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java index 3d1d9ec490..385ac4d1a2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java @@ -1,20 +1,11 @@ + package 
eu.dnetlib.dhp.sx.graph; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; -import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; -import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; -import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; -import eu.dnetlib.dhp.utils.DHPUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import net.minidev.json.JSONArray; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.FileStatus; @@ -31,228 +22,236 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; +import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; +import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; +import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; +import eu.dnetlib.dhp.utils.DHPUtils; +import net.minidev.json.JSONArray; import scala.Tuple2; /** - * This job is responsible of the creation of RAW Graph It is applied to the different entities - * generated from {@link SparkExtractEntitiesJob} In case of dataset, publication and Unknown - * Entities we group all the entities of the same type by their identifier, and then in the reduce - * phase we merge all the entities. 
Merge means: -merge all the metadata -merge the collected From - * values - * - *

In case of relation we need to make a different work: -Phase 1: Map reduce jobs Map: Get all - * Relation and emit a key constructed by (source, relType, Target) and the relation itself Reduce: - * Merge all relations Looking at the javadoc of {@link SparkSXGeneratePidSimlarity} we take the - * dataset of pid relation and joining by source and target we replace the wrong identifier in the - * relation with the correct ones. At the end we replace the new Dataset of Relation + * This job is responsible of the creation of RAW Graph It is applied to the different entities generated from + * {@link SparkExtractEntitiesJob} In case of dataset, publication and Unknown Entities we group all the entities of the + * same type by their identifier, and then in the reduce phase we merge all the entities. Merge means: -merge all the + * metadata -merge the collected From values + *

+ * In case of relation we need to make a different work: -Phase 1: Map reduce jobs Map: Get all Relation and emit a key + * constructed by (source, relType, Target) and the relation itself Reduce: Merge all relations Looking at the javadoc + * of {@link SparkSXGeneratePidSimlarity} we take the dataset of pid relation and joining by source and target we + * replace the wrong identifier in the relation with the correct ones. At the end we replace the new Dataset of Relation */ public class SparkScholexplorerCreateRawGraphJob { - static final String IDJSONPATH = "$.id"; - static final String SOURCEJSONPATH = "$.source"; - static final String TARGETJSONPATH = "$.target"; - static final String RELJSONPATH = "$.relType"; + static final String IDJSONPATH = "$.id"; + static final String SOURCEJSONPATH = "$.source"; + static final String TARGETJSONPATH = "$.target"; + static final String RELJSONPATH = "$.relType"; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkScholexplorerCreateRawGraphJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .config( - new SparkConf() - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")) - .appName(SparkScholexplorerCreateRawGraphJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String targetPath = parser.get("targetPath"); - final String entity = parser.get("entity"); - FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration()); - List subFolder = - Arrays.stream(fs.listStatus(new Path(inputPath))) - .filter(FileStatus::isDirectory) - 
.map(FileStatus::getPath) - .collect(Collectors.toList()); - List> inputRdd = new ArrayList<>(); - subFolder.forEach(p -> inputRdd.add(sc.textFile(p.toUri().getRawPath()))); - JavaRDD union = sc.emptyRDD(); - for (JavaRDD item : inputRdd) { - union = union.union(item); - } - switch (entity) { - case "dataset": - union - .mapToPair( - (PairFunction) - f -> { - final String id = getJPathString(IDJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(id, mapper.readValue(f, DLIDataset.class)); - }) - .reduceByKey( - (a, b) -> { - a.mergeFrom(b); - return a; - }) - .map( - item -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(item._2()); - }) - .saveAsTextFile(targetPath, GzipCodec.class); - break; - case "publication": - union - .mapToPair( - (PairFunction) - f -> { - final String id = getJPathString(IDJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(id, mapper.readValue(f, DLIPublication.class)); - }) - .reduceByKey( - (a, b) -> { - a.mergeFrom(b); - return a; - }) - .map( - item -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(item._2()); - }) - .saveAsTextFile(targetPath, GzipCodec.class); - break; - case "unknown": - union - .mapToPair( - (PairFunction) - f -> { - final String id = getJPathString(IDJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(id, mapper.readValue(f, DLIUnknown.class)); - }) - .reduceByKey( - (a, b) -> { - a.mergeFrom(b); - return a; - }) - .map( - item -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(item._2()); - }) - .saveAsTextFile(targetPath, GzipCodec.class); - break; - case "relation": - 
SparkSXGeneratePidSimlarity.generateDataFrame( - spark, sc, inputPath.replace("/relation", ""), targetPath.replace("/relation", "")); - RDD rdd = - union - .mapToPair( - (PairFunction) - f -> { - final String source = getJPathString(SOURCEJSONPATH, f); - final String target = getJPathString(TARGETJSONPATH, f); - final String reltype = getJPathString(RELJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure( - DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>( - DHPUtils.md5( - String.format( - "%s::%s::%s", - source.toLowerCase(), - reltype.toLowerCase(), - target.toLowerCase())), - mapper.readValue(f, DLIRelation.class)); - }) - .reduceByKey( - (a, b) -> { - a.mergeFrom(b); - return a; - }) - .map(Tuple2::_2) - .rdd(); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkScholexplorerCreateRawGraphJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .config( + new SparkConf() + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")) + .appName(SparkScholexplorerCreateRawGraphJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String targetPath = parser.get("targetPath"); + final String entity = parser.get("entity"); + FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration()); + List subFolder = Arrays + .stream(fs.listStatus(new Path(inputPath))) + .filter(FileStatus::isDirectory) + .map(FileStatus::getPath) + .collect(Collectors.toList()); + List> inputRdd = new ArrayList<>(); + subFolder.forEach(p -> inputRdd.add(sc.textFile(p.toUri().getRawPath()))); + JavaRDD union = sc.emptyRDD(); + for (JavaRDD item : inputRdd) { + union = 
union.union(item); + } + switch (entity) { + case "dataset": + union + .mapToPair( + (PairFunction) f -> { + final String id = getJPathString(IDJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + return new Tuple2<>(id, mapper.readValue(f, DLIDataset.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map( + item -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(item._2()); + }) + .saveAsTextFile(targetPath, GzipCodec.class); + break; + case "publication": + union + .mapToPair( + (PairFunction) f -> { + final String id = getJPathString(IDJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + return new Tuple2<>(id, mapper.readValue(f, DLIPublication.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map( + item -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(item._2()); + }) + .saveAsTextFile(targetPath, GzipCodec.class); + break; + case "unknown": + union + .mapToPair( + (PairFunction) f -> { + final String id = getJPathString(IDJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + return new Tuple2<>(id, mapper.readValue(f, DLIUnknown.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map( + item -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(item._2()); + }) + .saveAsTextFile(targetPath, GzipCodec.class); + break; + case "relation": + SparkSXGeneratePidSimlarity + .generateDataFrame( + spark, sc, inputPath.replace("/relation", ""), targetPath.replace("/relation", "")); + RDD rdd = union + .mapToPair( + (PairFunction) f -> { + final String source = getJPathString(SOURCEJSONPATH, f); + final String target = 
getJPathString(TARGETJSONPATH, f); + final String reltype = getJPathString(RELJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper + .configure( + DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + return new Tuple2<>( + DHPUtils + .md5( + String + .format( + "%s::%s::%s", + source.toLowerCase(), + reltype.toLowerCase(), + target.toLowerCase())), + mapper.readValue(f, DLIRelation.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map(Tuple2::_2) + .rdd(); - spark - .createDataset(rdd, Encoders.bean(DLIRelation.class)) - .write() - .mode(SaveMode.Overwrite) - .save(targetPath); - Dataset rel_ds = spark.read().load(targetPath).as(Encoders.bean(Relation.class)); + spark + .createDataset(rdd, Encoders.bean(DLIRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .save(targetPath); + Dataset rel_ds = spark.read().load(targetPath).as(Encoders.bean(Relation.class)); - System.out.println("LOADING PATH :" + targetPath.replace("/relation", "") + "/pid_simRel"); - Dataset sim_ds = - spark - .read() - .load(targetPath.replace("/relation", "") + "/pid_simRel") - .as(Encoders.bean(Relation.class)); + System.out.println("LOADING PATH :" + targetPath.replace("/relation", "") + "/pid_simRel"); + Dataset sim_ds = spark + .read() + .load(targetPath.replace("/relation", "") + "/pid_simRel") + .as(Encoders.bean(Relation.class)); - Dataset ids = - sim_ds.map( - (MapFunction) - relation -> { - final String type = StringUtils.substringBefore(relation.getSource(), "|"); - relation.setTarget( - String.format( - "%s|%s", - type, StringUtils.substringAfter(relation.getTarget(), "::"))); - return relation; - }, - Encoders.bean(Relation.class)); + Dataset ids = sim_ds + .map( + (MapFunction) relation -> { + final String type = StringUtils.substringBefore(relation.getSource(), "|"); + relation + .setTarget( + String + .format( + "%s|%s", + type, StringUtils.substringAfter(relation.getTarget(), "::"))); + return relation; + }, + 
Encoders.bean(Relation.class)); - final Dataset firstJoin = - rel_ds - .joinWith(ids, ids.col("target").equalTo(rel_ds.col("source")), "left_outer") - .map( - (MapFunction, Relation>) - s -> { - if (s._2() != null) { - s._1().setSource(s._2().getSource()); - } - return s._1(); - }, - Encoders.bean(Relation.class)); + final Dataset firstJoin = rel_ds + .joinWith(ids, ids.col("target").equalTo(rel_ds.col("source")), "left_outer") + .map( + (MapFunction, Relation>) s -> { + if (s._2() != null) { + s._1().setSource(s._2().getSource()); + } + return s._1(); + }, + Encoders.bean(Relation.class)); - Dataset secondJoin = - firstJoin - .joinWith(ids, ids.col("target").equalTo(firstJoin.col("target")), "left_outer") - .map( - (MapFunction, Relation>) - s -> { - if (s._2() != null) { - s._1().setTarget(s._2().getSource()); - } - return s._1(); - }, - Encoders.bean(Relation.class)); - secondJoin.write().mode(SaveMode.Overwrite).save(targetPath + "_fixed"); + Dataset secondJoin = firstJoin + .joinWith(ids, ids.col("target").equalTo(firstJoin.col("target")), "left_outer") + .map( + (MapFunction, Relation>) s -> { + if (s._2() != null) { + s._1().setTarget(s._2().getSource()); + } + return s._1(); + }, + Encoders.bean(Relation.class)); + secondJoin.write().mode(SaveMode.Overwrite).save(targetPath + "_fixed"); - FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration()); + FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration()); - fileSystem.delete(new Path(targetPath), true); - fileSystem.rename(new Path(targetPath + "_fixed"), new Path(targetPath)); - } - } + fileSystem.delete(new Path(targetPath), true); + fileSystem.rename(new Path(targetPath + "_fixed"), new Path(targetPath)); + } + } - public static String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) return (String) o; - if (o instanceof JSONArray && ((JSONArray) o).size() > 0) - return (String) ((JSONArray) o).get(0); 
- return ""; - } catch (Exception e) { - return ""; - } - } + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java index e0b0710c96..97f1251f0a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java @@ -1,11 +1,6 @@ + package eu.dnetlib.dhp.sx.graph; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; -import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser; -import eu.dnetlib.scholexplorer.relation.RelationMapper; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; @@ -13,56 +8,65 @@ import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; +import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser; +import eu.dnetlib.scholexplorer.relation.RelationMapper; import 
scala.Tuple2; /** - * This Job read a sequential File containing XML stored in the aggregator and generates an RDD of - * heterogeneous entities like Dataset, Relation, Publication and Unknown + * This Job read a sequential File containing XML stored in the aggregator and generates an RDD of heterogeneous + * entities like Dataset, Relation, Publication and Unknown */ public class SparkScholexplorerGraphImporter { - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkScholexplorerGraphImporter.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkScholexplorerGraphImporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkScholexplorerGraphImporter.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkScholexplorerGraphImporter.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); - RelationMapper relationMapper = RelationMapper.load(); + RelationMapper relationMapper = RelationMapper.load(); - sc.sequenceFile(inputPath, IntWritable.class, Text.class) - .map(Tuple2::_2) - .map(Text::toString) - .repartition(500) - .flatMap( - (FlatMapFunction) - record -> { - switch (parser.get("entity")) { - 
case "dataset": - final DatasetScholexplorerParser d = new DatasetScholexplorerParser(); - return d.parseObject(record, relationMapper).iterator(); - case "publication": - final PublicationScholexplorerParser p = new PublicationScholexplorerParser(); - return p.parseObject(record, relationMapper).iterator(); - default: - throw new IllegalArgumentException("wrong values of entities"); - } - }) - .map( - k -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(k); - }) - .saveAsTextFile(parser.get("targetPath"), GzipCodec.class); - } + sc + .sequenceFile(inputPath, IntWritable.class, Text.class) + .map(Tuple2::_2) + .map(Text::toString) + .repartition(500) + .flatMap( + (FlatMapFunction) record -> { + switch (parser.get("entity")) { + case "dataset": + final DatasetScholexplorerParser d = new DatasetScholexplorerParser(); + return d.parseObject(record, relationMapper).iterator(); + case "publication": + final PublicationScholexplorerParser p = new PublicationScholexplorerParser(); + return p.parseObject(record, relationMapper).iterator(); + default: + throw new IllegalArgumentException("wrong values of entities"); + } + }) + .map( + k -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(k); + }) + .saveAsTextFile(parser.get("targetPath"), GzipCodec.class); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java index 5e11c2a536..c97753fdc7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java @@ -1,5 +1,17 @@ + package eu.dnetlib.dhp.sx.graph.parser; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import 
java.util.stream.Collectors; + +import javax.xml.stream.XMLStreamReader; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; @@ -8,199 +20,195 @@ import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.scholexplorer.relation.RelInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import javax.xml.stream.XMLStreamReader; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; public abstract class AbstractScholexplorerParser { - protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class); - static final Pattern pattern = - Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE); - private List datasetSubTypes = - Arrays.asList( - "dataset", - "software", - "film", - "sound", - "physicalobject", - "audiovisual", - "collection", - "other", - "study", - "metadata"); + protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class); + static final Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE); + private List datasetSubTypes = Arrays + .asList( + "dataset", + "software", + "film", + "sound", + "physicalobject", + "audiovisual", + "collection", + "other", + "study", + "metadata"); - public abstract List parseObject(final String record, final RelationMapper relMapper); + public abstract List parseObject(final String record, final RelationMapper relMapper); - protected Map getAttributes(final XMLStreamReader parser) { - final Map attributesMap = new HashMap<>(); - 
for (int i = 0; i < parser.getAttributeCount(); i++) { - attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i)); - } - return attributesMap; - } + protected Map getAttributes(final XMLStreamReader parser) { + final Map attributesMap = new HashMap<>(); + for (int i = 0; i < parser.getAttributeCount(); i++) { + attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i)); + } + return attributesMap; + } - protected List extractSubject(List subjects) { - final List subjectResult = new ArrayList<>(); - if (subjects != null && subjects.size() > 0) { - subjects.forEach( - subjectMap -> { - final StructuredProperty subject = new StructuredProperty(); - subject.setValue(subjectMap.getTextValue()); - final Qualifier schema = new Qualifier(); - schema.setClassid("dnet:subject"); - schema.setClassname("dnet:subject"); - schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme")); - schema.setSchemename(subjectMap.getAttributes().get("subjectScheme")); - subject.setQualifier(schema); - subjectResult.add(subject); - }); - } - return subjectResult; - } + protected List extractSubject(List subjects) { + final List subjectResult = new ArrayList<>(); + if (subjects != null && subjects.size() > 0) { + subjects + .forEach( + subjectMap -> { + final StructuredProperty subject = new StructuredProperty(); + subject.setValue(subjectMap.getTextValue()); + final Qualifier schema = new Qualifier(); + schema.setClassid("dnet:subject"); + schema.setClassname("dnet:subject"); + schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme")); + schema.setSchemename(subjectMap.getAttributes().get("subjectScheme")); + subject.setQualifier(schema); + subjectResult.add(subject); + }); + } + return subjectResult; + } - protected StructuredProperty extractIdentifier( - List identifierType, final String fieldName) { - final StructuredProperty pid = new StructuredProperty(); - if (identifierType != null && identifierType.size() > 0) { - final 
VtdUtilityParser.Node result = identifierType.get(0); - pid.setValue(result.getTextValue()); - final Qualifier pidType = new Qualifier(); - pidType.setClassname(result.getAttributes().get(fieldName)); - pidType.setClassid(result.getAttributes().get(fieldName)); - pidType.setSchemename("dnet:pid_types"); - pidType.setSchemeid("dnet:pid_types"); - pid.setQualifier(pidType); - return pid; - } - return null; - } + protected StructuredProperty extractIdentifier( + List identifierType, final String fieldName) { + final StructuredProperty pid = new StructuredProperty(); + if (identifierType != null && identifierType.size() > 0) { + final VtdUtilityParser.Node result = identifierType.get(0); + pid.setValue(result.getTextValue()); + final Qualifier pidType = new Qualifier(); + pidType.setClassname(result.getAttributes().get(fieldName)); + pidType.setClassid(result.getAttributes().get(fieldName)); + pidType.setSchemename("dnet:pid_types"); + pidType.setSchemeid("dnet:pid_types"); + pid.setQualifier(pidType); + return pid; + } + return null; + } - protected void inferPid(final StructuredProperty input) { - final Matcher matcher = pattern.matcher(input.getValue()); - if (matcher.find()) { - input.setValue(matcher.group()); - if (input.getQualifier() == null) { - input.setQualifier(new Qualifier()); - input.getQualifier().setSchemename("dnet:pid_types"); - input.getQualifier().setSchemeid("dnet:pid_types"); - } - input.getQualifier().setClassid("doi"); - input.getQualifier().setClassname("doi"); - } - } + protected void inferPid(final StructuredProperty input) { + final Matcher matcher = pattern.matcher(input.getValue()); + if (matcher.find()) { + input.setValue(matcher.group()); + if (input.getQualifier() == null) { + input.setQualifier(new Qualifier()); + input.getQualifier().setSchemename("dnet:pid_types"); + input.getQualifier().setSchemeid("dnet:pid_types"); + } + input.getQualifier().setClassid("doi"); + input.getQualifier().setClassname("doi"); + } + } - protected String 
generateId(final String pid, final String pidType, final String entityType) { - String type; - switch (entityType) { - case "publication": - type = "50|"; - break; - case "dataset": - type = "60|"; - break; - case "unknown": - type = "70|"; - break; - default: - throw new IllegalArgumentException("unexpected value " + entityType); - } - if ("dnet".equalsIgnoreCase(pidType)) return type + StringUtils.substringAfter(pid, "::"); + protected String generateId(final String pid, final String pidType, final String entityType) { + String type; + switch (entityType) { + case "publication": + type = "50|"; + break; + case "dataset": + type = "60|"; + break; + case "unknown": + type = "70|"; + break; + default: + throw new IllegalArgumentException("unexpected value " + entityType); + } + if ("dnet".equalsIgnoreCase(pidType)) + return type + StringUtils.substringAfter(pid, "::"); - return type - + DHPUtils.md5( - String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); - } + return type + + DHPUtils + .md5( + String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); + } - protected DLIUnknown createUnknownObject( - final String pid, - final String pidType, - final KeyValue cf, - final DataInfo di, - final String dateOfCollection) { - final DLIUnknown uk = new DLIUnknown(); - uk.setId(generateId(pid, pidType, "unknown")); - ProvenaceInfo pi = new ProvenaceInfo(); - pi.setId(cf.getKey()); - pi.setName(cf.getValue()); - pi.setCompletionStatus("incomplete"); - uk.setDataInfo(di); - uk.setDlicollectedfrom(Collections.singletonList(pi)); - final StructuredProperty sourcePid = new StructuredProperty(); - sourcePid.setValue(pid); - final Qualifier pt = new Qualifier(); - pt.setClassname(pidType); - pt.setClassid(pidType); - pt.setSchemename("dnet:pid_types"); - pt.setSchemeid("dnet:pid_types"); - sourcePid.setQualifier(pt); - uk.setPid(Collections.singletonList(sourcePid)); - uk.setDateofcollection(dateOfCollection); - return uk; - } + 
protected DLIUnknown createUnknownObject( + final String pid, + final String pidType, + final KeyValue cf, + final DataInfo di, + final String dateOfCollection) { + final DLIUnknown uk = new DLIUnknown(); + uk.setId(generateId(pid, pidType, "unknown")); + ProvenaceInfo pi = new ProvenaceInfo(); + pi.setId(cf.getKey()); + pi.setName(cf.getValue()); + pi.setCompletionStatus("incomplete"); + uk.setDataInfo(di); + uk.setDlicollectedfrom(Collections.singletonList(pi)); + final StructuredProperty sourcePid = new StructuredProperty(); + sourcePid.setValue(pid); + final Qualifier pt = new Qualifier(); + pt.setClassname(pidType); + pt.setClassid(pidType); + pt.setSchemename("dnet:pid_types"); + pt.setSchemeid("dnet:pid_types"); + sourcePid.setQualifier(pt); + uk.setPid(Collections.singletonList(sourcePid)); + uk.setDateofcollection(dateOfCollection); + return uk; + } - protected void generateRelations( - RelationMapper relationMapper, - Result parsedObject, - List result, - DataInfo di, - String dateOfCollection, - List relatedIdentifiers) { - if (relatedIdentifiers != null) { - result.addAll( - relatedIdentifiers.stream() - .flatMap( - n -> { - final List rels = new ArrayList<>(); - DLIRelation r = new DLIRelation(); - r.setSource(parsedObject.getId()); - final String relatedPid = n.getTextValue(); - final String relatedPidType = n.getAttributes().get("relatedIdentifierType"); - final String relatedType = - n.getAttributes().getOrDefault("entityType", "unknown"); - String relationSemantic = n.getAttributes().get("relationType"); - String inverseRelation; - final String targetId = generateId(relatedPid, relatedPidType, relatedType); - r.setDateOfCollection(dateOfCollection); - if (relationMapper.containsKey(relationSemantic.toLowerCase())) { - RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase()); - relationSemantic = relInfo.getOriginal(); - inverseRelation = relInfo.getInverse(); - } else { - relationSemantic = "Unknown"; - inverseRelation = "Unknown"; - } 
- r.setTarget(targetId); - r.setRelType(relationSemantic); - r.setRelClass("datacite"); - r.setCollectedfrom(parsedObject.getCollectedfrom()); - r.setDataInfo(di); - rels.add(r); - r = new DLIRelation(); - r.setDataInfo(di); - r.setSource(targetId); - r.setTarget(parsedObject.getId()); - r.setRelType(inverseRelation); - r.setRelClass("datacite"); - r.setCollectedfrom(parsedObject.getCollectedfrom()); - r.setDateOfCollection(dateOfCollection); - rels.add(r); - if ("unknown".equalsIgnoreCase(relatedType)) - result.add( - createUnknownObject( - relatedPid, - relatedPidType, - parsedObject.getCollectedfrom().get(0), - di, - dateOfCollection)); - return rels.stream(); - }) - .collect(Collectors.toList())); - } - } + protected void generateRelations( + RelationMapper relationMapper, + Result parsedObject, + List result, + DataInfo di, + String dateOfCollection, + List relatedIdentifiers) { + if (relatedIdentifiers != null) { + result + .addAll( + relatedIdentifiers + .stream() + .flatMap( + n -> { + final List rels = new ArrayList<>(); + DLIRelation r = new DLIRelation(); + r.setSource(parsedObject.getId()); + final String relatedPid = n.getTextValue(); + final String relatedPidType = n.getAttributes().get("relatedIdentifierType"); + final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown"); + String relationSemantic = n.getAttributes().get("relationType"); + String inverseRelation; + final String targetId = generateId(relatedPid, relatedPidType, relatedType); + r.setDateOfCollection(dateOfCollection); + if (relationMapper.containsKey(relationSemantic.toLowerCase())) { + RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase()); + relationSemantic = relInfo.getOriginal(); + inverseRelation = relInfo.getInverse(); + } else { + relationSemantic = "Unknown"; + inverseRelation = "Unknown"; + } + r.setTarget(targetId); + r.setRelType(relationSemantic); + r.setRelClass("datacite"); + r.setCollectedfrom(parsedObject.getCollectedfrom()); + 
r.setDataInfo(di); + rels.add(r); + r = new DLIRelation(); + r.setDataInfo(di); + r.setSource(targetId); + r.setTarget(parsedObject.getId()); + r.setRelType(inverseRelation); + r.setRelClass("datacite"); + r.setCollectedfrom(parsedObject.getCollectedfrom()); + r.setDateOfCollection(dateOfCollection); + rels.add(r); + if ("unknown".equalsIgnoreCase(relatedType)) + result + .add( + createUnknownObject( + relatedPid, + relatedPidType, + parsedObject.getCollectedfrom().get(0), + di, + dateOfCollection)); + return rels.stream(); + }) + .collect(Collectors.toList())); + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java index 07b711106b..f49163c87a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java @@ -1,270 +1,292 @@ + package eu.dnetlib.dhp.sx.graph.parser; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + import com.ximpleware.AutoPilot; import com.ximpleware.VTDGen; import com.ximpleware.VTDNav; + import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; public class DatasetScholexplorerParser extends 
AbstractScholexplorerParser { - @Override - public List parseObject(String record, final RelationMapper relationMapper) { - try { - final DLIDataset parsedObject = new DLIDataset(); - final VTDGen vg = new VTDGen(); - vg.setDoc(record.getBytes()); - final List result = new ArrayList<>(); - vg.parse(true); + @Override + public List parseObject(String record, final RelationMapper relationMapper) { + try { + final DLIDataset parsedObject = new DLIDataset(); + final VTDGen vg = new VTDGen(); + vg.setDoc(record.getBytes()); + final List result = new ArrayList<>(); + vg.parse(true); - final VTDNav vn = vg.getNav(); - final AutoPilot ap = new AutoPilot(vn); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); - DataInfo di = new DataInfo(); - di.setTrust("0.9"); - di.setDeletedbyinference(false); - di.setInvisible(false); - parsedObject.setDataInfo(di); + DataInfo di = new DataInfo(); + di.setTrust("0.9"); + di.setDeletedbyinference(false); + di.setInvisible(false); + parsedObject.setDataInfo(di); - parsedObject.setOriginalId( - Collections.singletonList( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); + parsedObject + .setOriginalId( + Collections + .singletonList( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); - parsedObject.setOriginalObjIdentifier( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); - String dateOfCollection = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); - parsedObject.setDateofcollection(dateOfCollection); + parsedObject + .setOriginalObjIdentifier( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); + String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); + parsedObject.setDateofcollection(dateOfCollection); - final String resolvedDate = - VtdUtilityParser.getSingleValue(ap, vn, 
"//*[local-name()='resolvedDate']"); + final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); - if (StringUtils.isNotBlank(resolvedDate)) { - StructuredProperty currentDate = new StructuredProperty(); - currentDate.setValue(resolvedDate); - final Qualifier dateQualifier = new Qualifier(); - dateQualifier.setClassname("resolvedDate"); - dateQualifier.setClassid("resolvedDate"); - dateQualifier.setSchemename("dnet::date"); - dateQualifier.setSchemeid("dnet::date"); - currentDate.setQualifier(dateQualifier); - parsedObject.setRelevantdate(Collections.singletonList(currentDate)); - } + if (StringUtils.isNotBlank(resolvedDate)) { + StructuredProperty currentDate = new StructuredProperty(); + currentDate.setValue(resolvedDate); + final Qualifier dateQualifier = new Qualifier(); + dateQualifier.setClassname("resolvedDate"); + dateQualifier.setClassid("resolvedDate"); + dateQualifier.setSchemename("dnet::date"); + dateQualifier.setSchemeid("dnet::date"); + currentDate.setQualifier(dateQualifier); + parsedObject.setRelevantdate(Collections.singletonList(currentDate)); + } - final String completionStatus = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"); - final String provisionMode = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); + final String completionStatus = VtdUtilityParser + .getSingleValue(ap, vn, "//*[local-name()='completionStatus']"); + final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); - final String publisher = - VtdUtilityParser.getSingleValue( - ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']"); + final String publisher = VtdUtilityParser + .getSingleValue( + ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']"); - List collectedFromNodes = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='collectedFrom']", - 
Arrays.asList("name", "id", "mode", "completionStatus")); + List collectedFromNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='collectedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); - List resolvededFromNodes = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resolvedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); + List resolvededFromNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resolvedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); - Field pf = new Field<>(); - pf.setValue(publisher); + Field pf = new Field<>(); + pf.setValue(publisher); - parsedObject.setPublisher(pf); - final List provenances = new ArrayList<>(); - if (collectedFromNodes != null && collectedFromNodes.size() > 0) { - collectedFromNodes.forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode(provisionMode); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } + parsedObject.setPublisher(pf); + final List provenances = new ArrayList<>(); + if (collectedFromNodes != null && collectedFromNodes.size() > 0) { + collectedFromNodes + .forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode(provisionMode); + provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); + } - if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { - resolvededFromNodes.forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - 
provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode("resolved"); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } + if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { + resolvededFromNodes + .forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode("resolved"); + provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); + } - parsedObject.setDlicollectedfrom(provenances); - parsedObject.setCollectedfrom( - parsedObject.getDlicollectedfrom().stream() - .map( - p -> { - final KeyValue cf = new KeyValue(); - cf.setKey(p.getId()); - cf.setValue(p.getName()); - return cf; - }) - .collect(Collectors.toList())); - parsedObject.setCompletionStatus( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); + parsedObject.setDlicollectedfrom(provenances); + parsedObject + .setCollectedfrom( + parsedObject + .getDlicollectedfrom() + .stream() + .map( + p -> { + final KeyValue cf = new KeyValue(); + cf.setKey(p.getId()); + cf.setValue(p.getName()); + return cf; + }) + .collect(Collectors.toList())); + parsedObject + .setCompletionStatus( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); - final List identifierType = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resource']/*[local-name()='identifier']", - Collections.singletonList("identifierType")); + final List identifierType = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resource']/*[local-name()='identifier']", + Collections.singletonList("identifierType")); - StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType"); - if (currentPid == null) 
return null; - inferPid(currentPid); - parsedObject.setPid(Collections.singletonList(currentPid)); + StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType"); + if (currentPid == null) + return null; + inferPid(currentPid); + parsedObject.setPid(Collections.singletonList(currentPid)); - final String sourceId = - generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset"); - parsedObject.setId(sourceId); + final String sourceId = generateId( + currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset"); + parsedObject.setId(sourceId); - List descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']"); - if (descs != null && descs.size() > 0) - parsedObject.setDescription( - descs.stream() - .map(it -> it.length() < 10000 ? it : it.substring(0, 10000)) - .map( - it -> { - final Field d = new Field<>(); - d.setValue(it); - return d; - }) - .collect(Collectors.toList())); + List descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']"); + if (descs != null && descs.size() > 0) + parsedObject + .setDescription( + descs + .stream() + .map(it -> it.length() < 10000 ? 
it : it.substring(0, 10000)) + .map( + it -> { + final Field d = new Field<>(); + d.setValue(it); + return d; + }) + .collect(Collectors.toList())); - final List relatedIdentifiers = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='relatedIdentifier']", - Arrays.asList( - "relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); + final List relatedIdentifiers = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='relatedIdentifier']", + Arrays + .asList( + "relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); - generateRelations( - relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); + generateRelations( + relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); - final List hostedBy = - VtdUtilityParser.getTextValuesWithAttributes( - ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); + final List hostedBy = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); - if (hostedBy != null) { - parsedObject.setInstance( - hostedBy.stream() - .map( - it -> { - final Instance i = new Instance(); - i.setUrl(Collections.singletonList(currentPid.getValue())); - KeyValue h = new KeyValue(); - i.setHostedby(h); - h.setKey(it.getAttributes().get("id")); - h.setValue(it.getAttributes().get("name")); - return i; - }) - .collect(Collectors.toList())); - } + if (hostedBy != null) { + parsedObject + .setInstance( + hostedBy + .stream() + .map( + it -> { + final Instance i = new Instance(); + i.setUrl(Collections.singletonList(currentPid.getValue())); + KeyValue h = new KeyValue(); + i.setHostedby(h); + h.setKey(it.getAttributes().get("id")); + h.setValue(it.getAttributes().get("name")); + return i; + }) + .collect(Collectors.toList())); + } - List subjects = - extractSubject( - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - 
"//*[local-name()='resource']//*[local-name()='subject']", - Collections.singletonList("subjectScheme"))); + List subjects = extractSubject( + VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resource']//*[local-name()='subject']", + Collections.singletonList("subjectScheme"))); - parsedObject.setSubject(subjects); + parsedObject.setSubject(subjects); - Qualifier q = new Qualifier(); - q.setClassname("dataset"); - q.setClassid("dataset"); - q.setSchemename("dataset"); - q.setSchemeid("dataset"); - parsedObject.setResulttype(q); + Qualifier q = new Qualifier(); + q.setClassname("dataset"); + q.setClassid("dataset"); + q.setSchemename("dataset"); + q.setSchemeid("dataset"); + parsedObject.setResulttype(q); - parsedObject.setCompletionStatus(completionStatus); + parsedObject.setCompletionStatus(completionStatus); - final List creators = - VtdUtilityParser.getTextValue( - ap, - vn, - "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']"); - if (creators != null && creators.size() > 0) { - parsedObject.setAuthor( - creators.stream() - .map( - a -> { - final Author author = new Author(); - author.setFullname(a); - return author; - }) - .collect(Collectors.toList())); - } - final List titles = - VtdUtilityParser.getTextValue( - ap, vn, "//*[local-name()='resource']//*[local-name()='title']"); - if (titles != null && titles.size() > 0) { - parsedObject.setTitle( - titles.stream() - .map( - t -> { - final StructuredProperty st = new StructuredProperty(); - st.setValue(t); - return st; - }) - .collect(Collectors.toList())); - } + final List creators = VtdUtilityParser + .getTextValue( + ap, + vn, + "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']"); + if (creators != null && creators.size() > 0) { + parsedObject + .setAuthor( + creators + .stream() + .map( + a -> { + final Author author = new Author(); + author.setFullname(a); + return author; + }) + 
.collect(Collectors.toList())); + } + final List titles = VtdUtilityParser + .getTextValue( + ap, vn, "//*[local-name()='resource']//*[local-name()='title']"); + if (titles != null && titles.size() > 0) { + parsedObject + .setTitle( + titles + .stream() + .map( + t -> { + final StructuredProperty st = new StructuredProperty(); + st.setValue(t); + return st; + }) + .collect(Collectors.toList())); + } - final List dates = - VtdUtilityParser.getTextValue( - ap, - vn, - "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']"); + final List dates = VtdUtilityParser + .getTextValue( + ap, + vn, + "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']"); - if (dates != null && dates.size() > 0) { - parsedObject.setRelevantdate( - dates.stream() - .map( - cd -> { - StructuredProperty date = new StructuredProperty(); - date.setValue(cd); - final Qualifier dq = new Qualifier(); - dq.setClassname("date"); - dq.setClassid("date"); - dq.setSchemename("dnet::date"); - dq.setSchemeid("dnet::date"); - date.setQualifier(dq); - return date; - }) - .collect(Collectors.toList())); - } + if (dates != null && dates.size() > 0) { + parsedObject + .setRelevantdate( + dates + .stream() + .map( + cd -> { + StructuredProperty date = new StructuredProperty(); + date.setValue(cd); + final Qualifier dq = new Qualifier(); + dq.setClassname("date"); + dq.setClassid("date"); + dq.setSchemename("dnet::date"); + dq.setSchemeid("dnet::date"); + date.setQualifier(dq); + return date; + }) + .collect(Collectors.toList())); + } - result.add(parsedObject); - return result; - } catch (Throwable e) { - log.error("Error on parsing record " + record, e); - return null; - } - } + result.add(parsedObject); + return result; + } catch (Throwable e) { + log.error("Error on parsing record " + record, e); + return null; + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java index 2f7d484173..edbb444db7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java @@ -1,243 +1,259 @@ + package eu.dnetlib.dhp.sx.graph.parser; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + import com.ximpleware.AutoPilot; import com.ximpleware.VTDGen; import com.ximpleware.VTDNav; + import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; public class PublicationScholexplorerParser extends AbstractScholexplorerParser { - @Override - public List parseObject(final String record, final RelationMapper relationMapper) { - try { - final List result = new ArrayList<>(); - final DLIPublication parsedObject = new DLIPublication(); - final VTDGen vg = new VTDGen(); - vg.setDoc(record.getBytes()); - vg.parse(true); + @Override + public List parseObject(final String record, final RelationMapper relationMapper) { + try { + final List result = new ArrayList<>(); + final DLIPublication parsedObject = new DLIPublication(); + final VTDGen vg = new VTDGen(); + vg.setDoc(record.getBytes()); + vg.parse(true); - final VTDNav vn = vg.getNav(); - final AutoPilot ap = new AutoPilot(vn); + 
final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); - final DataInfo di = new DataInfo(); - di.setTrust("0.9"); - di.setDeletedbyinference(false); - di.setInvisible(false); + final DataInfo di = new DataInfo(); + di.setTrust("0.9"); + di.setDeletedbyinference(false); + di.setInvisible(false); - String dateOfCollection = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); - parsedObject.setDateofcollection(dateOfCollection); + String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); + parsedObject.setDateofcollection(dateOfCollection); - final String resolvedDate = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); - parsedObject.setOriginalId( - Collections.singletonList( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); + final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); + parsedObject + .setOriginalId( + Collections + .singletonList( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); - if (StringUtils.isNotBlank(resolvedDate)) { - StructuredProperty currentDate = new StructuredProperty(); - currentDate.setValue(resolvedDate); - final Qualifier dateQualifier = new Qualifier(); - dateQualifier.setClassname("resolvedDate"); - dateQualifier.setClassid("resolvedDate"); - dateQualifier.setSchemename("dnet::date"); - dateQualifier.setSchemeid("dnet::date"); - currentDate.setQualifier(dateQualifier); - parsedObject.setRelevantdate(Collections.singletonList(currentDate)); - } + if (StringUtils.isNotBlank(resolvedDate)) { + StructuredProperty currentDate = new StructuredProperty(); + currentDate.setValue(resolvedDate); + final Qualifier dateQualifier = new Qualifier(); + dateQualifier.setClassname("resolvedDate"); + dateQualifier.setClassid("resolvedDate"); + dateQualifier.setSchemename("dnet::date"); + 
dateQualifier.setSchemeid("dnet::date"); + currentDate.setQualifier(dateQualifier); + parsedObject.setRelevantdate(Collections.singletonList(currentDate)); + } - final List pid = - VtdUtilityParser.getTextValuesWithAttributes( - ap, vn, "//*[local-name()='pid']", Arrays.asList("type")); + final List pid = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//*[local-name()='pid']", Arrays.asList("type")); - StructuredProperty currentPid = extractIdentifier(pid, "type"); - if (currentPid == null) return null; - inferPid(currentPid); - parsedObject.setPid(Collections.singletonList(currentPid)); - final String sourceId = - generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication"); - parsedObject.setId(sourceId); + StructuredProperty currentPid = extractIdentifier(pid, "type"); + if (currentPid == null) + return null; + inferPid(currentPid); + parsedObject.setPid(Collections.singletonList(currentPid)); + final String sourceId = generateId( + currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication"); + parsedObject.setId(sourceId); - parsedObject.setOriginalObjIdentifier( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); + parsedObject + .setOriginalObjIdentifier( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); - String provisionMode = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); + String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); - List collectedFromNodes = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='collectedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); + List collectedFromNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='collectedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); - List resolvededFromNodes = - 
VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resolvedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); + List resolvededFromNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resolvedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); - final String publisher = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']"); - Field pf = new Field<>(); - pf.setValue(publisher); + final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']"); + Field pf = new Field<>(); + pf.setValue(publisher); - parsedObject.setPublisher(pf); - final List provenances = new ArrayList<>(); - if (collectedFromNodes != null && collectedFromNodes.size() > 0) { - collectedFromNodes.forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode(provisionMode); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } + parsedObject.setPublisher(pf); + final List provenances = new ArrayList<>(); + if (collectedFromNodes != null && collectedFromNodes.size() > 0) { + collectedFromNodes + .forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode(provisionMode); + provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); + } - if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { - resolvededFromNodes.forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - 
provenance.setCollectionMode("resolved"); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } + if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { + resolvededFromNodes + .forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode("resolved"); + provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); + } - parsedObject.setDlicollectedfrom(provenances); - parsedObject.setCompletionStatus( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); + parsedObject.setDlicollectedfrom(provenances); + parsedObject + .setCompletionStatus( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); - parsedObject.setCollectedfrom( - parsedObject.getDlicollectedfrom().stream() - .map( - p -> { - final KeyValue cf = new KeyValue(); - cf.setKey(p.getId()); - cf.setValue(p.getName()); - return cf; - }) - .collect(Collectors.toList())); + parsedObject + .setCollectedfrom( + parsedObject + .getDlicollectedfrom() + .stream() + .map( + p -> { + final KeyValue cf = new KeyValue(); + cf.setKey(p.getId()); + cf.setValue(p.getName()); + return cf; + }) + .collect(Collectors.toList())); - final List relatedIdentifiers = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='relatedIdentifier']", - Arrays.asList( - "relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); - generateRelations( - relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); + final List relatedIdentifiers = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='relatedIdentifier']", + Arrays + .asList( + "relatedIdentifierType", "relationType", "entityType", 
"inverseRelationType")); + generateRelations( + relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); - final List hostedBy = - VtdUtilityParser.getTextValuesWithAttributes( - ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); + final List hostedBy = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); - if (hostedBy != null) { - parsedObject.setInstance( - hostedBy.stream() - .map( - it -> { - final Instance i = new Instance(); - i.setUrl(Collections.singletonList(currentPid.getValue())); - KeyValue h = new KeyValue(); - i.setHostedby(h); - h.setKey(it.getAttributes().get("id")); - h.setValue(it.getAttributes().get("name")); - return i; - }) - .collect(Collectors.toList())); - } + if (hostedBy != null) { + parsedObject + .setInstance( + hostedBy + .stream() + .map( + it -> { + final Instance i = new Instance(); + i.setUrl(Collections.singletonList(currentPid.getValue())); + KeyValue h = new KeyValue(); + i.setHostedby(h); + h.setKey(it.getAttributes().get("id")); + h.setValue(it.getAttributes().get("name")); + return i; + }) + .collect(Collectors.toList())); + } - final List authorsNode = - VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']"); - if (authorsNode != null) - parsedObject.setAuthor( - authorsNode.stream() - .map( - a -> { - final Author author = new Author(); - author.setFullname(a); - return author; - }) - .collect(Collectors.toList())); + final List authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']"); + if (authorsNode != null) + parsedObject + .setAuthor( + authorsNode + .stream() + .map( + a -> { + final Author author = new Author(); + author.setFullname(a); + return author; + }) + .collect(Collectors.toList())); - final List titles = - VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']"); - if (titles != null) { - parsedObject.setTitle( - titles.stream() - .map( - t -> { - 
final StructuredProperty st = new StructuredProperty(); - st.setValue(t); - return st; - }) - .collect(Collectors.toList())); - } + final List titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']"); + if (titles != null) { + parsedObject + .setTitle( + titles + .stream() + .map( + t -> { + final StructuredProperty st = new StructuredProperty(); + st.setValue(t); + return st; + }) + .collect(Collectors.toList())); + } - Field description = new Field<>(); + Field description = new Field<>(); - description.setValue( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']")); + description + .setValue( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']")); - if (StringUtils.isNotBlank(description.getValue()) - && description.getValue().length() > 10000) { - description.setValue(description.getValue().substring(0, 10000)); - } + if (StringUtils.isNotBlank(description.getValue()) + && description.getValue().length() > 10000) { + description.setValue(description.getValue().substring(0, 10000)); + } - parsedObject.setDescription(Collections.singletonList(description)); + parsedObject.setDescription(Collections.singletonList(description)); - final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']"); + final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']"); - StructuredProperty date = new StructuredProperty(); - date.setValue(cd); - final Qualifier dq = new Qualifier(); - dq.setClassname("date"); - dq.setClassid("date"); - dq.setSchemename("dnet::date"); - dq.setSchemeid("dnet::date"); - date.setQualifier(dq); - parsedObject.setRelevantdate(Collections.singletonList(date)); + StructuredProperty date = new StructuredProperty(); + date.setValue(cd); + final Qualifier dq = new Qualifier(); + dq.setClassname("date"); + dq.setClassid("date"); + dq.setSchemename("dnet::date"); + dq.setSchemeid("dnet::date"); + date.setQualifier(dq); + 
parsedObject.setRelevantdate(Collections.singletonList(date)); - List subjects = - extractSubject( - VtdUtilityParser.getTextValuesWithAttributes( - ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme"))); - parsedObject.setSubject(subjects); + List subjects = extractSubject( + VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme"))); + parsedObject.setSubject(subjects); - parsedObject.setDataInfo(di); + parsedObject.setDataInfo(di); - parsedObject.setSubject(subjects); - Qualifier q = new Qualifier(); - q.setClassname("publication"); - q.setClassid("publication"); - q.setSchemename("publication"); - q.setSchemeid("publication"); - parsedObject.setResulttype(q); - result.add(parsedObject); - return result; + parsedObject.setSubject(subjects); + Qualifier q = new Qualifier(); + q.setClassname("publication"); + q.setClassid("publication"); + q.setSchemename("publication"); + q.setSchemeid("publication"); + parsedObject.setResulttype(q); + result.add(parsedObject); + return result; - } catch (Throwable e) { - log.error("Input record: " + record); - log.error("Error on parsing record ", e); - return null; - } - } + } catch (Throwable e) { + log.error("Input record: " + record); + log.error("Error on parsing record ", e); + return null; + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java index 06d9d1e8ad..e951746703 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java @@ -1,10 +1,10 @@ + package eu.dnetlib.dhp.oa.graph; -import eu.dnetlib.dhp.oa.graph.hive.GraphHiveImporterJob; -import eu.dnetlib.dhp.schema.common.ModelSupport; import 
java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.RandomStringUtils; import org.apache.spark.SparkConf; @@ -16,76 +16,82 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.oa.graph.hive.GraphHiveImporterJob; +import eu.dnetlib.dhp.schema.common.ModelSupport; + public class GraphHiveImporterJobTest { - private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJobTest.class); + private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJobTest.class); - public static final String JDBC_DERBY_TEMPLATE = - "jdbc:derby:;databaseName=%s/junit_metastore_db;create=true"; + public static final String JDBC_DERBY_TEMPLATE = "jdbc:derby:;databaseName=%s/junit_metastore_db;create=true"; - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - private static String dbName; + private static String dbName; - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(GraphHiveImporterJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(GraphHiveImporterJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - dbName = RandomStringUtils.randomAlphabetic(5); - log.info("using DB name {}", "test_" + dbName); + dbName = RandomStringUtils.randomAlphabetic(5); + log.info("using DB name {}", "test_" + dbName); - SparkConf conf = new SparkConf(); - conf.setAppName(GraphHiveImporterJobTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(GraphHiveImporterJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - 
conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - conf.set( - "javax.jdo.option.ConnectionURL", - String.format(JDBC_DERBY_TEMPLATE, workingDir.resolve("warehouse").toString())); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf + .set( + "javax.jdo.option.ConnectionURL", + String.format(JDBC_DERBY_TEMPLATE, workingDir.resolve("warehouse").toString())); - spark = - SparkSession.builder() - .appName(GraphHiveImporterJobTest.class.getSimpleName()) - .config(conf) - .enableHiveSupport() - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(GraphHiveImporterJobTest.class.getSimpleName()) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void testImportGraphAsHiveDB() throws Exception { + @Test + public void testImportGraphAsHiveDB() throws Exception { - GraphHiveImporterJob.main( - new String[] { - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-inputPath", - getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(), - "-hiveMetastoreUris", - "", - "-hiveDbName", - dbName - }); + GraphHiveImporterJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputPath", + getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(), + "-hiveMetastoreUris", + "", + 
"-hiveDbName", + dbName + }); - ModelSupport.oafTypes.forEach( - (name, clazz) -> { - long count = spark.read().table(dbName + "." + name).count(); - int expected = name.equals("relation") ? 100 : 10; + ModelSupport.oafTypes + .forEach( + (name, clazz) -> { + long count = spark.read().table(dbName + "." + name).count(); + int expected = name.equals("relation") ? 100 : 10; - Assertions.assertEquals( - expected, count, String.format("%s should be %s", name, expected)); - }); - } + Assertions + .assertEquals( + expected, count, String.format("%s should be %s", name, expected)); + }); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 89740718bb..951c97d9de 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -6,14 +7,10 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.when; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Software; import java.io.IOException; import java.util.List; import java.util.Map; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -22,124 +19,131 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Publication; 
+import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; + @ExtendWith(MockitoExtension.class) public class MappersTest { - @Mock private Map code2name; + @Mock + private Map code2name; - @BeforeEach - public void setUp() throws Exception { - when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0)); - } + @BeforeEach + public void setUp() throws Exception { + when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0)); + } - @Test - void testPublication() throws IOException { - final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); + @Test + void testPublication() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); - final List list = new OafToOafMapper(code2name).processMdRecord(xml); + final List list = new OafToOafMapper(code2name).processMdRecord(xml); - assertEquals(3, list.size()); - assertTrue(list.get(0) instanceof Publication); - assertTrue(list.get(1) instanceof Relation); - assertTrue(list.get(2) instanceof Relation); + assertEquals(3, list.size()); + assertTrue(list.get(0) instanceof Publication); + assertTrue(list.get(1) instanceof Relation); + assertTrue(list.get(2) instanceof Relation); - final Publication p = (Publication) list.get(0); - final Relation r1 = (Relation) list.get(1); - final Relation r2 = (Relation) list.get(2); + final Publication p = (Publication) list.get(0); + final Relation r1 = (Relation) list.get(1); + final Relation r2 = (Relation) list.get(2); - assertValidId(p.getId()); - assertValidId(p.getCollectedfrom().get(0).getKey()); - assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); - assertTrue(p.getAuthor().size() > 0); - assertTrue(p.getSubject().size() > 0); - assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); - assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); - assertTrue(p.getInstance().size() > 0); + 
assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertTrue(p.getAuthor().size() > 0); + assertTrue(p.getSubject().size() > 0); + assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); + assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); + assertTrue(p.getInstance().size() > 0); - assertValidId(r1.getSource()); - assertValidId(r1.getTarget()); - assertValidId(r2.getSource()); - assertValidId(r2.getTarget()); - assertValidId(r1.getCollectedfrom().get(0).getKey()); - assertValidId(r2.getCollectedfrom().get(0).getKey()); - assertNotNull(r1.getDataInfo()); - assertNotNull(r2.getDataInfo()); - assertNotNull(r1.getDataInfo().getTrust()); - assertNotNull(r2.getDataInfo().getTrust()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertTrue(StringUtils.isNotBlank(r1.getRelClass())); - assertTrue(StringUtils.isNotBlank(r2.getRelClass())); - assertTrue(StringUtils.isNotBlank(r1.getRelType())); - assertTrue(StringUtils.isNotBlank(r2.getRelType())); + assertValidId(r1.getSource()); + assertValidId(r1.getTarget()); + assertValidId(r2.getSource()); + assertValidId(r2.getTarget()); + assertValidId(r1.getCollectedfrom().get(0).getKey()); + assertValidId(r2.getCollectedfrom().get(0).getKey()); + assertNotNull(r1.getDataInfo()); + assertNotNull(r2.getDataInfo()); + assertNotNull(r1.getDataInfo().getTrust()); + assertNotNull(r2.getDataInfo().getTrust()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertTrue(StringUtils.isNotBlank(r1.getRelClass())); + assertTrue(StringUtils.isNotBlank(r2.getRelClass())); + assertTrue(StringUtils.isNotBlank(r1.getRelType())); + assertTrue(StringUtils.isNotBlank(r2.getRelType())); - // System.out.println(new ObjectMapper().writeValueAsString(r1)); - // System.out.println(new ObjectMapper().writeValueAsString(r2)); - } + 
// System.out.println(new ObjectMapper().writeValueAsString(r1)); + // System.out.println(new ObjectMapper().writeValueAsString(r2)); + } - @Test - void testDataset() throws IOException { - final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); + @Test + void testDataset() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); - final List list = new OdfToOafMapper(code2name).processMdRecord(xml); + final List list = new OdfToOafMapper(code2name).processMdRecord(xml); - assertEquals(3, list.size()); - assertTrue(list.get(0) instanceof Dataset); - assertTrue(list.get(1) instanceof Relation); - assertTrue(list.get(2) instanceof Relation); + assertEquals(3, list.size()); + assertTrue(list.get(0) instanceof Dataset); + assertTrue(list.get(1) instanceof Relation); + assertTrue(list.get(2) instanceof Relation); - final Dataset d = (Dataset) list.get(0); - final Relation r1 = (Relation) list.get(1); - final Relation r2 = (Relation) list.get(2); + final Dataset d = (Dataset) list.get(0); + final Relation r1 = (Relation) list.get(1); + final Relation r2 = (Relation) list.get(2); - assertValidId(d.getId()); - assertValidId(d.getCollectedfrom().get(0).getKey()); - assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); - assertTrue(d.getAuthor().size() > 0); - assertTrue(d.getSubject().size() > 0); - assertTrue(d.getInstance().size() > 0); - assertTrue(d.getContext().size() > 0); - assertTrue(d.getContext().get(0).getId().length() > 0); + assertValidId(d.getId()); + assertValidId(d.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); + assertTrue(d.getAuthor().size() > 0); + assertTrue(d.getSubject().size() > 0); + assertTrue(d.getInstance().size() > 0); + assertTrue(d.getContext().size() > 0); + assertTrue(d.getContext().get(0).getId().length() > 0); - assertValidId(r1.getSource()); - assertValidId(r1.getTarget()); - 
assertValidId(r2.getSource()); - assertValidId(r2.getTarget()); - assertNotNull(r1.getDataInfo()); - assertNotNull(r2.getDataInfo()); - assertNotNull(r1.getDataInfo().getTrust()); - assertNotNull(r2.getDataInfo().getTrust()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertTrue(StringUtils.isNotBlank(r1.getRelClass())); - assertTrue(StringUtils.isNotBlank(r2.getRelClass())); - assertTrue(StringUtils.isNotBlank(r1.getRelType())); - assertTrue(StringUtils.isNotBlank(r2.getRelType())); - } + assertValidId(r1.getSource()); + assertValidId(r1.getTarget()); + assertValidId(r2.getSource()); + assertValidId(r2.getTarget()); + assertNotNull(r1.getDataInfo()); + assertNotNull(r2.getDataInfo()); + assertNotNull(r1.getDataInfo().getTrust()); + assertNotNull(r2.getDataInfo().getTrust()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertTrue(StringUtils.isNotBlank(r1.getRelClass())); + assertTrue(StringUtils.isNotBlank(r2.getRelClass())); + assertTrue(StringUtils.isNotBlank(r1.getRelType())); + assertTrue(StringUtils.isNotBlank(r2.getRelType())); + } - @Test - void testSoftware() throws IOException { - final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); + @Test + void testSoftware() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); - final List list = new OdfToOafMapper(code2name).processMdRecord(xml); + final List list = new OdfToOafMapper(code2name).processMdRecord(xml); - assertEquals(1, list.size()); - assertTrue(list.get(0) instanceof Software); + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Software); - final Software s = (Software) list.get(0); + final Software s = (Software) list.get(0); - assertValidId(s.getId()); - assertValidId(s.getCollectedfrom().get(0).getKey()); - assertTrue(StringUtils.isNotBlank(s.getTitle().get(0).getValue())); - 
assertTrue(s.getAuthor().size() > 0); - assertTrue(s.getSubject().size() > 0); - assertTrue(s.getInstance().size() > 0); - } + assertValidId(s.getId()); + assertValidId(s.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(s.getTitle().get(0).getValue())); + assertTrue(s.getAuthor().size() > 0); + assertTrue(s.getSubject().size() > 0); + assertTrue(s.getInstance().size() > 0); + } - private void assertValidId(final String id) { - assertEquals(49, id.length()); - assertEquals('|', id.charAt(2)); - assertEquals(':', id.charAt(15)); - assertEquals(':', id.charAt(16)); - } + private void assertValidId(final String id) { + assertEquals(49, id.length()); + assertEquals('|', id.charAt(2)); + assertEquals(':', id.charAt(15)); + assertEquals(':', id.charAt(16)); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 0d3a273ecb..1bbe57ee83 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -1,17 +1,10 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Datasource; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; import java.io.IOException; import java.sql.Array; import java.sql.Date; @@ -19,6 +12,7 @@ 
import java.sql.ResultSet; import java.sql.SQLException; import java.util.List; import java.util.Objects; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -28,316 +22,332 @@ import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; + @ExtendWith(MockitoExtension.class) public class MigrateDbEntitiesApplicationTest { - private MigrateDbEntitiesApplication app; + private MigrateDbEntitiesApplication app; - @Mock private ResultSet rs; + @Mock + private ResultSet rs; - @BeforeEach - public void setUp() { - this.app = new MigrateDbEntitiesApplication(); - } + @BeforeEach + public void setUp() { + this.app = new MigrateDbEntitiesApplication(); + } - @Test - public void testProcessDatasource() throws Exception { - final List fields = prepareMocks("datasources_resultset_entry.json"); + @Test + public void testProcessDatasource() throws Exception { + final List fields = prepareMocks("datasources_resultset_entry.json"); - final List list = app.processDatasource(rs); - assertEquals(1, list.size()); - verifyMocks(fields); + final List list = app.processDatasource(rs); + assertEquals(1, list.size()); + verifyMocks(fields); - final Datasource ds = (Datasource) list.get(0); - assertValidId(ds.getId()); - assertValidId(ds.getCollectedfrom().get(0).getKey()); - assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields)); - assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields)); - assertEquals(ds.getContactemail().getValue(), 
getValueAsString("contactemail", fields)); - assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); - assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields)); - assertEquals( - ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); - } + final Datasource ds = (Datasource) list.get(0); + assertValidId(ds.getId()); + assertValidId(ds.getCollectedfrom().get(0).getKey()); + assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields)); + assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields)); + assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields)); + assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); + assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields)); + assertEquals( + ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); + } - @Test - public void testProcessProject() throws Exception { - final List fields = prepareMocks("projects_resultset_entry.json"); + @Test + public void testProcessProject() throws Exception { + final List fields = prepareMocks("projects_resultset_entry.json"); - final List list = app.processProject(rs); - assertEquals(1, list.size()); - verifyMocks(fields); + final List list = app.processProject(rs); + assertEquals(1, list.size()); + verifyMocks(fields); - final Project p = (Project) list.get(0); - assertValidId(p.getId()); - assertValidId(p.getCollectedfrom().get(0).getKey()); - assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields)); - assertEquals(p.getTitle().getValue(), getValueAsString("title", fields)); - assertEquals( - p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); - } + final Project p = (Project) list.get(0); + assertValidId(p.getId()); + 
assertValidId(p.getCollectedfrom().get(0).getKey()); + assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields)); + assertEquals(p.getTitle().getValue(), getValueAsString("title", fields)); + assertEquals( + p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); + } - @Test - public void testProcessOrganization() throws Exception { - final List fields = prepareMocks("organizations_resultset_entry.json"); + @Test + public void testProcessOrganization() throws Exception { + final List fields = prepareMocks("organizations_resultset_entry.json"); - final List list = app.processOrganization(rs); + final List list = app.processOrganization(rs); - assertEquals(1, list.size()); + assertEquals(1, list.size()); - verifyMocks(fields); + verifyMocks(fields); - final Organization o = (Organization) list.get(0); - assertValidId(o.getId()); - assertValidId(o.getCollectedfrom().get(0).getKey()); - assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields)); - assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields)); - assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); - assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]); - assertEquals( - o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]); - assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]); - assertEquals( - o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]); - assertEquals( - o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); - } + final Organization o = (Organization) list.get(0); + assertValidId(o.getId()); + assertValidId(o.getCollectedfrom().get(0).getKey()); + assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields)); + assertEquals(o.getLegalname().getValue(), 
getValueAsString("legalname", fields)); + assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); + assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]); + assertEquals( + o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]); + assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]); + assertEquals( + o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]); + assertEquals( + o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); + } - @Test - public void testProcessDatasourceOrganization() throws Exception { - final List fields = prepareMocks("datasourceorganization_resultset_entry.json"); + @Test + public void testProcessDatasourceOrganization() throws Exception { + final List fields = prepareMocks("datasourceorganization_resultset_entry.json"); - final List list = app.processDatasourceOrganization(rs); + final List list = app.processDatasourceOrganization(rs); - assertEquals(2, list.size()); - verifyMocks(fields); + assertEquals(2, list.size()); + verifyMocks(fields); - final Relation r1 = (Relation) list.get(0); - final Relation r2 = (Relation) list.get(1); - assertValidId(r1.getSource()); - assertValidId(r2.getSource()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - } + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); + assertValidId(r1.getSource()); + assertValidId(r2.getSource()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + } - @Test - public void testProcessProjectOrganization() throws Exception { - final List fields = prepareMocks("projectorganization_resultset_entry.json"); + @Test + public void testProcessProjectOrganization() throws Exception { + final List fields = 
prepareMocks("projectorganization_resultset_entry.json"); - final List list = app.processProjectOrganization(rs); + final List list = app.processProjectOrganization(rs); - assertEquals(2, list.size()); - verifyMocks(fields); + assertEquals(2, list.size()); + verifyMocks(fields); - final Relation r1 = (Relation) list.get(0); - final Relation r2 = (Relation) list.get(1); - assertValidId(r1.getSource()); - assertValidId(r2.getSource()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertValidId(r1.getCollectedfrom().get(0).getKey()); - assertValidId(r2.getCollectedfrom().get(0).getKey()); - } + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); + assertValidId(r1.getSource()); + assertValidId(r2.getSource()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertValidId(r1.getCollectedfrom().get(0).getKey()); + assertValidId(r2.getCollectedfrom().get(0).getKey()); + } - @Test - public void testProcessClaims_context() throws Exception { - final List fields = prepareMocks("claimscontext_resultset_entry.json"); + @Test + public void testProcessClaims_context() throws Exception { + final List fields = prepareMocks("claimscontext_resultset_entry.json"); - final List list = app.processClaims(rs); + final List list = app.processClaims(rs); - assertEquals(1, list.size()); - assertTrue(list.get(0) instanceof Result); - final Result r = (Result) list.get(0); + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Result); + final Result r = (Result) list.get(0); - verifyMocks(fields); + verifyMocks(fields); - assertValidId(r.getCollectedfrom().get(0).getKey()); - } + assertValidId(r.getCollectedfrom().get(0).getKey()); + } - @Test - public void testProcessClaims_rels() throws Exception { - final List fields = prepareMocks("claimsrel_resultset_entry.json"); + @Test + public void testProcessClaims_rels() throws Exception { + 
final List fields = prepareMocks("claimsrel_resultset_entry.json"); - final List list = app.processClaims(rs); + final List list = app.processClaims(rs); - assertEquals(2, list.size()); - verifyMocks(fields); + assertEquals(2, list.size()); + verifyMocks(fields); - assertTrue(list.get(0) instanceof Relation); - assertTrue(list.get(1) instanceof Relation); + assertTrue(list.get(0) instanceof Relation); + assertTrue(list.get(1) instanceof Relation); - final Relation r1 = (Relation) list.get(0); - final Relation r2 = (Relation) list.get(1); + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); - assertValidId(r1.getSource()); - assertValidId(r1.getTarget()); - assertValidId(r2.getSource()); - assertValidId(r2.getTarget()); - assertNotNull(r1.getDataInfo()); - assertNotNull(r2.getDataInfo()); - assertNotNull(r1.getDataInfo().getTrust()); - assertNotNull(r2.getDataInfo().getTrust()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertTrue(StringUtils.isNotBlank(r1.getRelClass())); - assertTrue(StringUtils.isNotBlank(r2.getRelClass())); - assertTrue(StringUtils.isNotBlank(r1.getRelType())); - assertTrue(StringUtils.isNotBlank(r2.getRelType())); + assertValidId(r1.getSource()); + assertValidId(r1.getTarget()); + assertValidId(r2.getSource()); + assertValidId(r2.getTarget()); + assertNotNull(r1.getDataInfo()); + assertNotNull(r2.getDataInfo()); + assertNotNull(r1.getDataInfo().getTrust()); + assertNotNull(r2.getDataInfo().getTrust()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertTrue(StringUtils.isNotBlank(r1.getRelClass())); + assertTrue(StringUtils.isNotBlank(r2.getRelClass())); + assertTrue(StringUtils.isNotBlank(r1.getRelType())); + assertTrue(StringUtils.isNotBlank(r2.getRelType())); - assertValidId(r1.getCollectedfrom().get(0).getKey()); - assertValidId(r2.getCollectedfrom().get(0).getKey()); + 
assertValidId(r1.getCollectedfrom().get(0).getKey()); + assertValidId(r2.getCollectedfrom().get(0).getKey()); - // System.out.println(new ObjectMapper().writeValueAsString(r1)); - // System.out.println(new ObjectMapper().writeValueAsString(r2)); - } + // System.out.println(new ObjectMapper().writeValueAsString(r1)); + // System.out.println(new ObjectMapper().writeValueAsString(r2)); + } - private List prepareMocks(final String jsonFile) throws IOException, SQLException { - final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile)); - final ObjectMapper mapper = new ObjectMapper(); - final List list = mapper.readValue(json, new TypeReference>() {}); + private List prepareMocks(final String jsonFile) throws IOException, SQLException { + final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile)); + final ObjectMapper mapper = new ObjectMapper(); + final List list = mapper.readValue(json, new TypeReference>() { + }); - for (final TypedField tf : list) { - if (tf.getValue() == null) { - switch (tf.getType()) { - case "not_used": - break; - case "boolean": - Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false); - break; - case "date": - Mockito.when(rs.getDate(tf.getField())).thenReturn(null); - break; - case "int": - Mockito.when(rs.getInt(tf.getField())).thenReturn(0); - break; - case "double": - Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0); - break; - case "array": - Mockito.when(rs.getArray(tf.getField())).thenReturn(null); - break; - case "string": - default: - Mockito.when(rs.getString(tf.getField())).thenReturn(null); - break; - } - } else { - switch (tf.getType()) { - case "not_used": - break; - case "boolean": - Mockito.when(rs.getBoolean(tf.getField())) - .thenReturn(Boolean.parseBoolean(tf.getValue().toString())); - break; - case "date": - Mockito.when(rs.getDate(tf.getField())) - .thenReturn(Date.valueOf(tf.getValue().toString())); - break; - case "int": - Mockito.when(rs.getInt(tf.getField())) - 
.thenReturn(new Integer(tf.getValue().toString())); - break; - case "double": - Mockito.when(rs.getDouble(tf.getField())) - .thenReturn(new Double(tf.getValue().toString())); - break; - case "array": - final Array arr = Mockito.mock(Array.class); - final String[] values = - ((List) tf.getValue()) - .stream() - .filter(Objects::nonNull) - .map(o -> o.toString()) - .toArray(String[]::new); + for (final TypedField tf : list) { + if (tf.getValue() == null) { + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false); + break; + case "date": + Mockito.when(rs.getDate(tf.getField())).thenReturn(null); + break; + case "int": + Mockito.when(rs.getInt(tf.getField())).thenReturn(0); + break; + case "double": + Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0); + break; + case "array": + Mockito.when(rs.getArray(tf.getField())).thenReturn(null); + break; + case "string": + default: + Mockito.when(rs.getString(tf.getField())).thenReturn(null); + break; + } + } else { + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito + .when(rs.getBoolean(tf.getField())) + .thenReturn(Boolean.parseBoolean(tf.getValue().toString())); + break; + case "date": + Mockito + .when(rs.getDate(tf.getField())) + .thenReturn(Date.valueOf(tf.getValue().toString())); + break; + case "int": + Mockito + .when(rs.getInt(tf.getField())) + .thenReturn(new Integer(tf.getValue().toString())); + break; + case "double": + Mockito + .when(rs.getDouble(tf.getField())) + .thenReturn(new Double(tf.getValue().toString())); + break; + case "array": + final Array arr = Mockito.mock(Array.class); + final String[] values = ((List) tf.getValue()) + .stream() + .filter(Objects::nonNull) + .map(o -> o.toString()) + .toArray(String[]::new); - Mockito.when(arr.getArray()).thenReturn(values); - Mockito.when(rs.getArray(tf.getField())).thenReturn(arr); - break; - case "string": - default: - 
Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString()); - break; - } - } - } + Mockito.when(arr.getArray()).thenReturn(values); + Mockito.when(rs.getArray(tf.getField())).thenReturn(arr); + break; + case "string": + default: + Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString()); + break; + } + } + } - return list; - } + return list; + } - private void verifyMocks(final List list) throws SQLException { - for (final TypedField tf : list) { + private void verifyMocks(final List list) throws SQLException { + for (final TypedField tf : list) { - switch (tf.getType()) { - case "not_used": - break; - case "boolean": - Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField()); - break; - case "date": - Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField()); - break; - case "int": - Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField()); - break; - case "double": - Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField()); - break; - case "array": - Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField()); - break; - case "string": - default: - Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField()); - break; - } - } - } + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField()); + break; + case "date": + Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField()); + break; + case "int": + Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField()); + break; + case "double": + Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField()); + break; + case "array": + Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField()); + break; + case "string": + default: + Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField()); + break; + } + } + } - private void assertValidId(final String id) { - assertEquals(49, id.length()); - assertEquals('|', id.charAt(2)); - 
assertEquals(':', id.charAt(15)); - assertEquals(':', id.charAt(16)); - } + private void assertValidId(final String id) { + assertEquals(49, id.length()); + assertEquals('|', id.charAt(2)); + assertEquals(':', id.charAt(15)); + assertEquals(':', id.charAt(16)); + } - private String getValueAsString(final String name, final List fields) { - return fields.stream() - .filter(f -> f.getField().equals(name)) - .map(TypedField::getValue) - .filter(Objects::nonNull) - .map(o -> o.toString()) - .findFirst() - .get(); - } + private String getValueAsString(final String name, final List fields) { + return fields + .stream() + .filter(f -> f.getField().equals(name)) + .map(TypedField::getValue) + .filter(Objects::nonNull) + .map(o -> o.toString()) + .findFirst() + .get(); + } } class TypedField { - private String field; - private String type; - private Object value; + private String field; + private String type; + private Object value; - public String getField() { - return field; - } + public String getField() { + return field; + } - public void setField(final String field) { - this.field = field; - } + public void setField(final String field) { + this.field = field; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(final String type) { - this.type = type; - } + public void setType(final String type) { + this.type = type; + } - public Object getValue() { - return value; - } + public Object getValue() { + return value; + } - public void setValue(final Object value) { - this.value = value; - } + public void setValue(final Object value) { + this.value = value; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java index f5ba4af55c..d418da594d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java 
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java @@ -1,35 +1,40 @@ + package eu.dnetlib.dhp.sx.graph; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; + import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import java.util.List; -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; public class ScholexplorerParserTest { - @Test - public void testDataciteParser() throws Exception { - String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml")); + @Test + public void testDataciteParser() throws Exception { + String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml")); - DatasetScholexplorerParser p = new DatasetScholexplorerParser(); - List oaves = p.parseObject(xml, RelationMapper.load()); + DatasetScholexplorerParser p = new DatasetScholexplorerParser(); + List oaves = p.parseObject(xml, RelationMapper.load()); - ObjectMapper m = new ObjectMapper(); - m.enable(SerializationFeature.INDENT_OUTPUT); + ObjectMapper m = new ObjectMapper(); + m.enable(SerializationFeature.INDENT_OUTPUT); - oaves.forEach( - oaf -> { - try { - System.out.println(m.writeValueAsString(oaf)); - System.out.println("----------------------------"); - } catch (JsonProcessingException e) { + oaves + .forEach( + oaf -> { + try { + System.out.println(m.writeValueAsString(oaf)); + System.out.println("----------------------------"); + } catch (JsonProcessingException e) { - } - }); - } + } + }); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java 
b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java index 7f32de3188..ed3b6efdcc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java @@ -1,3 +1,5 @@ + package eu.dnetlib.dhp.sx.graph; -public class SparkScholexplorerGraphImporterTest {} +public class SparkScholexplorerGraphImporterTest { +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java index af63858031..348a2b030b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java @@ -1,3 +1,5 @@ + package eu.dnetlib.dhp.sx.graph; -public class SparkScholexplorerMergeEntitiesJobTest {} +public class SparkScholexplorerMergeEntitiesJobTest { +} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java index f9756c88b6..1b0cb4d055 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java @@ -1,46 +1,48 @@ + package eu.dnetlib.dhp.provision; +import org.apache.commons.lang3.StringUtils; + import eu.dnetlib.dhp.provision.scholix.summary.Typology; import eu.dnetlib.dhp.utils.DHPUtils; -import org.apache.commons.lang3.StringUtils; public class ProvisionUtil { - public static 
final String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference"; - public static final String TARGETJSONPATH = "$.target"; - public static final String SOURCEJSONPATH = "$.source"; + public static final String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference"; + public static final String TARGETJSONPATH = "$.target"; + public static final String SOURCEJSONPATH = "$.source"; - // public static RelatedItemInfo getItemType(final String item, final String idPath) { - // String targetId = DHPUtils.getJPathString(idPath, item); - // switch (StringUtils.substringBefore(targetId, "|")) { - // case "50": - // return new RelatedItemInfo(null,0,1,0); - // case "60": - // return new RelatedItemInfo(null,1,0,0); - // case "70": - // return new RelatedItemInfo(null,0,0,1); - // default: - // throw new RuntimeException("Unknonw target ID"); - // - // } - // - // } + // public static RelatedItemInfo getItemType(final String item, final String idPath) { + // String targetId = DHPUtils.getJPathString(idPath, item); + // switch (StringUtils.substringBefore(targetId, "|")) { + // case "50": + // return new RelatedItemInfo(null,0,1,0); + // case "60": + // return new RelatedItemInfo(null,1,0,0); + // case "70": + // return new RelatedItemInfo(null,0,0,1); + // default: + // throw new RuntimeException("Unknonw target ID"); + // + // } + // + // } - public static Boolean isNotDeleted(final String item) { - return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item)); - } + public static Boolean isNotDeleted(final String item) { + return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item)); + } - public static Typology getItemTypeFromId(String id) { + public static Typology getItemTypeFromId(String id) { - switch (StringUtils.substringBefore(id, "|")) { - case "50": - return Typology.publication; - case "60": - return Typology.dataset; - case "70": - return Typology.unknown; - default: - throw new 
RuntimeException("Unknonw ID type"); - } - } + switch (StringUtils.substringBefore(id, "|")) { + case "50": + return Typology.publication; + case "60": + return Typology.dataset; + case "70": + return Typology.unknown; + default: + throw new RuntimeException("Unknonw ID type"); + } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java index 7e322ce069..28826612d7 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.provision; import java.io.Serializable; @@ -5,53 +6,54 @@ import java.io.Serializable; /** This class models the information of related items */ public class RelatedItemInfo implements Serializable { - private String source; + private String source; - private long relatedDataset = 0; + private long relatedDataset = 0; - private long relatedPublication = 0; + private long relatedPublication = 0; - private long relatedUnknown = 0; + private long relatedUnknown = 0; - public RelatedItemInfo() {} + public RelatedItemInfo() { + } - public RelatedItemInfo( - String source, long relatedDataset, long relatedPublication, long relatedUnknown) { - this.source = source; - this.relatedDataset = relatedDataset; - this.relatedPublication = relatedPublication; - this.relatedUnknown = relatedUnknown; - } + public RelatedItemInfo( + String source, long relatedDataset, long relatedPublication, long relatedUnknown) { + this.source = source; + this.relatedDataset = relatedDataset; + this.relatedPublication = relatedPublication; + this.relatedUnknown = relatedUnknown; + } - public String getSource() { - return source; - } + public String getSource() { + return source; + } - 
public void setSource(String source) { - this.source = source; - } + public void setSource(String source) { + this.source = source; + } - public long getRelatedDataset() { - return relatedDataset; - } + public long getRelatedDataset() { + return relatedDataset; + } - public void setRelatedDataset(long relatedDataset) { - this.relatedDataset = relatedDataset; - } + public void setRelatedDataset(long relatedDataset) { + this.relatedDataset = relatedDataset; + } - public long getRelatedPublication() { - return relatedPublication; - } + public long getRelatedPublication() { + return relatedPublication; + } - public void setRelatedPublication(long relatedPublication) { - this.relatedPublication = relatedPublication; - } + public void setRelatedPublication(long relatedPublication) { + this.relatedPublication = relatedPublication; + } - public long getRelatedUnknown() { - return relatedUnknown; - } + public long getRelatedUnknown() { + return relatedUnknown; + } - public void setRelatedUnknown(int relatedUnknown) { - this.relatedUnknown = relatedUnknown; - } + public void setRelatedUnknown(int relatedUnknown) { + this.relatedUnknown = relatedUnknown; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java index 14ffb32e5c..df167f104f 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java @@ -1,32 +1,34 @@ + package eu.dnetlib.dhp.provision; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import org.apache.commons.io.IOUtils; import org.apache.spark.sql.*; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + /** - * SparkExtractRelationCount is a spark 
job that takes in input relation RDD and retrieve for each - * item in relation which are the number of - Related Dataset - Related Publication - Related - * Unknown + * SparkExtractRelationCount is a spark job that takes in input relation RDD and retrieve for each item in relation + * which are the number of - Related Dataset - Related Publication - Related Unknown */ public class SparkExtractRelationCount { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkExtractRelationCount.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_related_entities_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkExtractRelationCount.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkExtractRelationCount.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_related_entities_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkExtractRelationCount.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final String workingDirPath = parser.get("workingDirPath"); + final String workingDirPath = parser.get("workingDirPath"); - final String relationPath = parser.get("relationPath"); - DatasetJoiner.startJoin(spark, relationPath, workingDirPath + "/relatedItemCount"); - } + final String relationPath = parser.get("relationPath"); + DatasetJoiner.startJoin(spark, relationPath, workingDirPath + "/relatedItemCount"); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java 
b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java index 327bad94e7..f9f3a58ce6 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java @@ -1,10 +1,6 @@ + package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.*; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; -import eu.dnetlib.dhp.schema.oaf.Relation; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -14,91 +10,100 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.*; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; +import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; public class SparkGenerateScholix { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkGenerateScholix.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); - parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.sql.shuffle.partitions", "4000"); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - final SparkSession spark = - SparkSession.builder() - .config(conf) - .appName(SparkExtractRelationCount.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public 
static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkGenerateScholix.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); + parser.parseArgument(args); + SparkConf conf = new SparkConf(); + conf.set("spark.sql.shuffle.partitions", "4000"); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + final SparkSession spark = SparkSession + .builder() + .config(conf) + .appName(SparkExtractRelationCount.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - conf.registerKryoClasses( - new Class[] { - Scholix.class, ScholixCollectedFrom.class, ScholixEntityId.class, - ScholixIdentifier.class, ScholixRelationship.class, ScholixResource.class - }); + conf + .registerKryoClasses( + new Class[] { + Scholix.class, ScholixCollectedFrom.class, ScholixEntityId.class, + ScholixIdentifier.class, ScholixRelationship.class, ScholixResource.class + }); - final String graphPath = parser.get("graphPath"); - final String workingDirPath = parser.get("workingDirPath"); + final String graphPath = parser.get("graphPath"); + final String workingDirPath = parser.get("workingDirPath"); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final Dataset scholixSummary = - spark.read().load(workingDirPath + "/summary").as(Encoders.bean(ScholixSummary.class)); - final Dataset rels = - spark.read().load(graphPath + "/relation").as(Encoders.bean(Relation.class)); + final Dataset scholixSummary = spark + .read() + .load(workingDirPath + "/summary") + .as(Encoders.bean(ScholixSummary.class)); + final Dataset rels = spark.read().load(graphPath + "/relation").as(Encoders.bean(Relation.class)); - Dataset firstJoin = - scholixSummary - .joinWith(rels, scholixSummary.col("id").equalTo(rels.col("source"))) - .map( - 
(MapFunction, Scholix>) - f -> Scholix.generateScholixWithSource(f._1(), f._2()), - Encoders.bean(Scholix.class)); + Dataset firstJoin = scholixSummary + .joinWith(rels, scholixSummary.col("id").equalTo(rels.col("source"))) + .map( + (MapFunction, Scholix>) f -> Scholix + .generateScholixWithSource(f._1(), f._2()), + Encoders.bean(Scholix.class)); - firstJoin.write().mode(SaveMode.Overwrite).save(workingDirPath + "/scholix_1"); + firstJoin.write().mode(SaveMode.Overwrite).save(workingDirPath + "/scholix_1"); - Dataset scholix_final = - spark.read().load(workingDirPath + "/scholix_1").as(Encoders.bean(Scholix.class)); + Dataset scholix_final = spark + .read() + .load(workingDirPath + "/scholix_1") + .as(Encoders.bean(Scholix.class)); - scholixSummary - .map( - (MapFunction) ScholixResource::fromSummary, - Encoders.bean(ScholixResource.class)) - .repartition(1000) - .write() - .mode(SaveMode.Overwrite) - .save(workingDirPath + "/scholix_target"); + scholixSummary + .map( + (MapFunction) ScholixResource::fromSummary, + Encoders.bean(ScholixResource.class)) + .repartition(1000) + .write() + .mode(SaveMode.Overwrite) + .save(workingDirPath + "/scholix_target"); - Dataset target = - spark - .read() - .load(workingDirPath + "/scholix_target") - .as(Encoders.bean(ScholixResource.class)); + Dataset target = spark + .read() + .load(workingDirPath + "/scholix_target") + .as(Encoders.bean(ScholixResource.class)); - scholix_final - .joinWith( - target, scholix_final.col("identifier").equalTo(target.col("dnetIdentifier")), "inner") - .map( - (MapFunction, Scholix>) - f -> { - final Scholix scholix = f._1(); - final ScholixResource scholixTarget = f._2(); - scholix.setTarget(scholixTarget); - scholix.generateIdentifier(); - scholix.generatelinkPublisher(); - return scholix; - }, - Encoders.kryo(Scholix.class)) - .javaRDD() - .map( - s -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(s); - }) - .saveAsTextFile(workingDirPath + "/scholix_json", 
GzipCodec.class); - } + scholix_final + .joinWith( + target, scholix_final.col("identifier").equalTo(target.col("dnetIdentifier")), "inner") + .map( + (MapFunction, Scholix>) f -> { + final Scholix scholix = f._1(); + final ScholixResource scholixTarget = f._2(); + scholix.setTarget(scholixTarget); + scholix.generateIdentifier(); + scholix.generatelinkPublisher(); + return scholix; + }, + Encoders.kryo(Scholix.class)) + .javaRDD() + .map( + s -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(s); + }) + .saveAsTextFile(workingDirPath + "/scholix_json", GzipCodec.class); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java index a4a19e8339..04bde10996 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java @@ -1,104 +1,106 @@ + package eu.dnetlib.dhp.provision; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; -import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; public class SparkGenerateSummary { - private static final String jsonIDPath = "$.id"; + private static final String jsonIDPath = "$.id"; - public static void main(String[] args) throws Exception 
{ - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkGenerateSummary.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkExtractRelationCount.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkGenerateSummary.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkExtractRelationCount.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final String graphPath = parser.get("graphPath"); - final String workingDirPath = parser.get("workingDirPath"); + final String graphPath = parser.get("graphPath"); + final String workingDirPath = parser.get("workingDirPath"); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - Dataset rInfo = - spark - .read() - .load(workingDirPath + "/relatedItemCount") - .as(Encoders.bean(RelatedItemInfo.class)); + Dataset rInfo = spark + .read() + .load(workingDirPath + "/relatedItemCount") + .as(Encoders.bean(RelatedItemInfo.class)); - Dataset entity = - spark.createDataset( - sc.textFile( - graphPath + "/publication," + graphPath + "/dataset," + graphPath + "/unknown") - .map( - s -> - ScholixSummary.fromJsonOAF( - ProvisionUtil.getItemTypeFromId(DHPUtils.getJPathString(jsonIDPath, s)), - s)) - .rdd(), - Encoders.bean(ScholixSummary.class)); + Dataset entity = spark + .createDataset( + sc + .textFile( + graphPath + "/publication," + graphPath + "/dataset," + graphPath + "/unknown") + .map( + s -> 
ScholixSummary + .fromJsonOAF( + ProvisionUtil.getItemTypeFromId(DHPUtils.getJPathString(jsonIDPath, s)), + s)) + .rdd(), + Encoders.bean(ScholixSummary.class)); - Dataset summaryComplete = - rInfo - .joinWith(entity, rInfo.col("source").equalTo(entity.col("id"))) - .map( - (MapFunction, ScholixSummary>) - t -> { - ScholixSummary scholixSummary = t._2(); - RelatedItemInfo relatedItemInfo = t._1(); - scholixSummary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - scholixSummary.setRelatedPublications( - relatedItemInfo.getRelatedPublication()); - scholixSummary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - return scholixSummary; - }, - Encoders.bean(ScholixSummary.class)); + Dataset summaryComplete = rInfo + .joinWith(entity, rInfo.col("source").equalTo(entity.col("id"))) + .map( + (MapFunction, ScholixSummary>) t -> { + ScholixSummary scholixSummary = t._2(); + RelatedItemInfo relatedItemInfo = t._1(); + scholixSummary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); + scholixSummary + .setRelatedPublications( + relatedItemInfo.getRelatedPublication()); + scholixSummary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); + return scholixSummary; + }, + Encoders.bean(ScholixSummary.class)); - summaryComplete.write().save(workingDirPath + "/summary"); + summaryComplete.write().save(workingDirPath + "/summary"); - // JavaPairRDD relationCount = - // sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)); - // - // JavaPairRDD entities = - // sc.textFile(graphPath + "/publication") - // .filter(ProvisionUtil::isNotDeleted) - // .mapToPair((PairFunction) i -> new - // Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) - // .union( - // sc.textFile(graphPath + "/dataset") - // .filter(ProvisionUtil::isNotDeleted) - // .mapToPair((PairFunction) - // i -> - // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) - // ) - // .union( - // 
sc.textFile(graphPath + "/unknown") - // .filter(ProvisionUtil::isNotDeleted) - // .mapToPair((PairFunction) - // i -> - // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) - // ); - // entities.join(relationCount).map((Function>, - // String>) k -> - // ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), - // k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class); - // - // - // ; + // JavaPairRDD relationCount = + // sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)); + // + // JavaPairRDD entities = + // sc.textFile(graphPath + "/publication") + // .filter(ProvisionUtil::isNotDeleted) + // .mapToPair((PairFunction) i -> new + // Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) + // .union( + // sc.textFile(graphPath + "/dataset") + // .filter(ProvisionUtil::isNotDeleted) + // .mapToPair((PairFunction) + // i -> + // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) + // ) + // .union( + // sc.textFile(graphPath + "/unknown") + // .filter(ProvisionUtil::isNotDeleted) + // .mapToPair((PairFunction) + // i -> + // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) + // ); + // entities.join(relationCount).map((Function>, + // String>) k -> + // ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), + // k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class); + // + // + // ; - } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java index 845284ccce..78d8730806 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java +++ 
b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java @@ -1,10 +1,9 @@ + package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; import java.util.HashMap; import java.util.Map; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -14,60 +13,64 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; + public class SparkIndexCollectionOnES { - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkIndexCollectionOnES.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/index_on_es.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkIndexCollectionOnES.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/index_on_es.json"))); + parser.parseArgument(args); - SparkConf conf = - new SparkConf() - .setAppName(SparkIndexCollectionOnES.class.getSimpleName()) - .setMaster(parser.get("master")); + SparkConf conf = new SparkConf() + .setAppName(SparkIndexCollectionOnES.class.getSimpleName()) + .setMaster(parser.get("master")); - conf.set("spark.sql.shuffle.partitions", "4000"); + conf.set("spark.sql.shuffle.partitions", "4000"); - final String sourcePath = parser.get("sourcePath"); - final String index = parser.get("index"); - final String idPath = parser.get("idPath"); 
- final String type = parser.get("type"); - final String indexHost = parser.get("esHost"); + final String sourcePath = parser.get("sourcePath"); + final String index = parser.get("index"); + final String idPath = parser.get("idPath"); + final String type = parser.get("type"); + final String indexHost = parser.get("esHost"); - final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); + final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD inputRdd; + JavaRDD inputRdd; - if ("summary".equalsIgnoreCase(type)) - inputRdd = - spark - .read() - .load(sourcePath) - .as(Encoders.bean(ScholixSummary.class)) - .map( - (MapFunction) - f -> { - final ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(f); - }, - Encoders.STRING()) - .javaRDD(); - else inputRdd = sc.textFile(sourcePath); + if ("summary".equalsIgnoreCase(type)) + inputRdd = spark + .read() + .load(sourcePath) + .as(Encoders.bean(ScholixSummary.class)) + .map( + (MapFunction) f -> { + final ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(f); + }, + Encoders.STRING()) + .javaRDD(); + else + inputRdd = sc.textFile(sourcePath); - Map esCfg = new HashMap<>(); - // esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54"); - esCfg.put("es.nodes", indexHost); - esCfg.put("es.mapping.id", idPath); - esCfg.put("es.batch.write.retry.count", "8"); - esCfg.put("es.batch.write.retry.wait", "60s"); - esCfg.put("es.batch.size.entries", "200"); - esCfg.put("es.nodes.wan.only", "true"); - JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); - } + Map esCfg = new HashMap<>(); + // esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54"); + esCfg.put("es.nodes", indexHost); + esCfg.put("es.mapping.id", idPath); + 
esCfg.put("es.batch.write.retry.count", "8"); + esCfg.put("es.batch.write.retry.wait", "60s"); + esCfg.put("es.batch.size.entries", "200"); + esCfg.put("es.nodes.wan.only", "true"); + JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java index 3130d8b982..d714155131 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java @@ -1,184 +1,200 @@ + package eu.dnetlib.dhp.provision.scholix; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.DHPUtils; import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.DHPUtils; + public class Scholix implements Serializable { - private String publicationDate; + private String publicationDate; - private List publisher; + private List publisher; - private List linkprovider; + private List linkprovider; - private ScholixRelationship relationship; + private ScholixRelationship relationship; - private ScholixResource source; + private ScholixResource source; - private ScholixResource target; + private ScholixResource target; - private String identifier; + private String identifier; - public Scholix clone(final ScholixResource t) { - final Scholix clone = new Scholix(); - clone.setPublicationDate(publicationDate); - clone.setPublisher(publisher); - 
clone.setLinkprovider(linkprovider); - clone.setRelationship(relationship); - clone.setSource(source); - clone.setTarget(t); - clone.generatelinkPublisher(); - clone.generateIdentifier(); - return clone; - } + public Scholix clone(final ScholixResource t) { + final Scholix clone = new Scholix(); + clone.setPublicationDate(publicationDate); + clone.setPublisher(publisher); + clone.setLinkprovider(linkprovider); + clone.setRelationship(relationship); + clone.setSource(source); + clone.setTarget(t); + clone.generatelinkPublisher(); + clone.generateIdentifier(); + return clone; + } - public static Scholix generateScholixWithSource( - final String sourceSummaryJson, final String relation) { - final ObjectMapper mapper = new ObjectMapper(); + public static Scholix generateScholixWithSource( + final String sourceSummaryJson, final String relation) { + final ObjectMapper mapper = new ObjectMapper(); - try { - ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class); - Relation rel = mapper.readValue(relation, Relation.class); - final Scholix s = new Scholix(); - if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) - s.setPublicationDate(scholixSummary.getDate().get(0)); - s.setLinkprovider( - rel.getCollectedfrom().stream() - .map( - cf -> - new ScholixEntityId( - cf.getValue(), - Collections.singletonList( - new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) - .collect(Collectors.toList())); - s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); - s.setSource(ScholixResource.fromSummary(scholixSummary)); - return s; - } catch (Throwable e) { - throw new RuntimeException( - String.format("Summary: %s \n relation:%s", sourceSummaryJson, relation), e); - } - } + try { + ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class); + Relation rel = mapper.readValue(relation, Relation.class); + final Scholix s = new Scholix(); + if 
(scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) + s.setPublicationDate(scholixSummary.getDate().get(0)); + s + .setLinkprovider( + rel + .getCollectedfrom() + .stream() + .map( + cf -> new ScholixEntityId( + cf.getValue(), + Collections + .singletonList( + new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) + .collect(Collectors.toList())); + s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); + s.setSource(ScholixResource.fromSummary(scholixSummary)); + return s; + } catch (Throwable e) { + throw new RuntimeException( + String.format("Summary: %s \n relation:%s", sourceSummaryJson, relation), e); + } + } - public static Scholix generateScholixWithSource( - final ScholixSummary scholixSummary, final Relation rel) { - final Scholix s = new Scholix(); - if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) - s.setPublicationDate(scholixSummary.getDate().get(0)); - s.setLinkprovider( - rel.getCollectedfrom().stream() - .map( - cf -> - new ScholixEntityId( - cf.getValue(), - Collections.singletonList( - new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) - .collect(Collectors.toList())); - s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); - s.setSource(ScholixResource.fromSummary(scholixSummary)); + public static Scholix generateScholixWithSource( + final ScholixSummary scholixSummary, final Relation rel) { + final Scholix s = new Scholix(); + if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) + s.setPublicationDate(scholixSummary.getDate().get(0)); + s + .setLinkprovider( + rel + .getCollectedfrom() + .stream() + .map( + cf -> new ScholixEntityId( + cf.getValue(), + Collections + .singletonList( + new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) + .collect(Collectors.toList())); + s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); + 
s.setSource(ScholixResource.fromSummary(scholixSummary)); - s.setIdentifier(rel.getTarget()); - // ScholixResource mockTarget = new ScholixResource(); - // mockTarget.setDnetIdentifier(rel.getTarget()); - // s.setTarget(mockTarget); - // s.generateIdentifier(); - return s; - } + s.setIdentifier(rel.getTarget()); + // ScholixResource mockTarget = new ScholixResource(); + // mockTarget.setDnetIdentifier(rel.getTarget()); + // s.setTarget(mockTarget); + // s.generateIdentifier(); + return s; + } - public void generatelinkPublisher() { - Set publisher = new HashSet<>(); - if (source.getPublisher() != null) - publisher.addAll( - source.getPublisher().stream() - .map(ScholixEntityId::getName) - .collect(Collectors.toList())); - if (target.getPublisher() != null) - publisher.addAll( - target.getPublisher().stream() - .map(ScholixEntityId::getName) - .collect(Collectors.toList())); - this.publisher = - publisher.stream().map(k -> new ScholixEntityId(k, null)).collect(Collectors.toList()); - } + public void generatelinkPublisher() { + Set publisher = new HashSet<>(); + if (source.getPublisher() != null) + publisher + .addAll( + source + .getPublisher() + .stream() + .map(ScholixEntityId::getName) + .collect(Collectors.toList())); + if (target.getPublisher() != null) + publisher + .addAll( + target + .getPublisher() + .stream() + .map(ScholixEntityId::getName) + .collect(Collectors.toList())); + this.publisher = publisher.stream().map(k -> new ScholixEntityId(k, null)).collect(Collectors.toList()); + } - public void generateIdentifier() { - setIdentifier( - DHPUtils.md5( - String.format( - "%s::%s::%s", - source.getDnetIdentifier(), relationship.getName(), target.getDnetIdentifier()))); - } + public void generateIdentifier() { + setIdentifier( + DHPUtils + .md5( + String + .format( + "%s::%s::%s", + source.getDnetIdentifier(), relationship.getName(), target.getDnetIdentifier()))); + } - public Scholix addTarget(final String targetSummaryJson) { - final ObjectMapper mapper = 
new ObjectMapper(); + public Scholix addTarget(final String targetSummaryJson) { + final ObjectMapper mapper = new ObjectMapper(); - try { - ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class); - setTarget(ScholixResource.fromSummary(targetSummary)); - generateIdentifier(); - return this; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } + try { + ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class); + setTarget(ScholixResource.fromSummary(targetSummary)); + generateIdentifier(); + return this; + } catch (Throwable e) { + throw new RuntimeException(e); + } + } - public String getPublicationDate() { - return publicationDate; - } + public String getPublicationDate() { + return publicationDate; + } - public void setPublicationDate(String publicationDate) { - this.publicationDate = publicationDate; - } + public void setPublicationDate(String publicationDate) { + this.publicationDate = publicationDate; + } - public List getPublisher() { - return publisher; - } + public List getPublisher() { + return publisher; + } - public void setPublisher(List publisher) { - this.publisher = publisher; - } + public void setPublisher(List publisher) { + this.publisher = publisher; + } - public List getLinkprovider() { - return linkprovider; - } + public List getLinkprovider() { + return linkprovider; + } - public void setLinkprovider(List linkprovider) { - this.linkprovider = linkprovider; - } + public void setLinkprovider(List linkprovider) { + this.linkprovider = linkprovider; + } - public ScholixRelationship getRelationship() { - return relationship; - } + public ScholixRelationship getRelationship() { + return relationship; + } - public void setRelationship(ScholixRelationship relationship) { - this.relationship = relationship; - } + public void setRelationship(ScholixRelationship relationship) { + this.relationship = relationship; + } - public ScholixResource getSource() { - return source; - 
} + public ScholixResource getSource() { + return source; + } - public void setSource(ScholixResource source) { - this.source = source; - } + public void setSource(ScholixResource source) { + this.source = source; + } - public ScholixResource getTarget() { - return target; - } + public ScholixResource getTarget() { + return target; + } - public void setTarget(ScholixResource target) { - this.target = target; - } + public void setTarget(ScholixResource target) { + this.target = target; + } - public String getIdentifier() { - return identifier; - } + public String getIdentifier() { + return identifier; + } - public void setIdentifier(String identifier) { - this.identifier = identifier; - } + public void setIdentifier(String identifier) { + this.identifier = identifier; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java index c55bbb111f..9ce071fbc2 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java @@ -1,43 +1,45 @@ + package eu.dnetlib.dhp.provision.scholix; import java.io.Serializable; public class ScholixCollectedFrom implements Serializable { - private ScholixEntityId provider; - private String provisionMode; - private String completionStatus; + private ScholixEntityId provider; + private String provisionMode; + private String completionStatus; - public ScholixCollectedFrom() {} + public ScholixCollectedFrom() { + } - public ScholixCollectedFrom( - ScholixEntityId provider, String provisionMode, String completionStatus) { - this.provider = provider; - this.provisionMode = provisionMode; - this.completionStatus = completionStatus; - } + public 
ScholixCollectedFrom( + ScholixEntityId provider, String provisionMode, String completionStatus) { + this.provider = provider; + this.provisionMode = provisionMode; + this.completionStatus = completionStatus; + } - public ScholixEntityId getProvider() { - return provider; - } + public ScholixEntityId getProvider() { + return provider; + } - public void setProvider(ScholixEntityId provider) { - this.provider = provider; - } + public void setProvider(ScholixEntityId provider) { + this.provider = provider; + } - public String getProvisionMode() { - return provisionMode; - } + public String getProvisionMode() { + return provisionMode; + } - public void setProvisionMode(String provisionMode) { - this.provisionMode = provisionMode; - } + public void setProvisionMode(String provisionMode) { + this.provisionMode = provisionMode; + } - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java index 226c3d20a3..e797017bc7 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java @@ -1,32 +1,34 @@ + package eu.dnetlib.dhp.provision.scholix; import java.io.Serializable; import java.util.List; public class ScholixEntityId implements Serializable { - private String name; - private List identifiers; + private String name; + private List identifiers; - 
public ScholixEntityId() {} + public ScholixEntityId() { + } - public ScholixEntityId(String name, List identifiers) { - this.name = name; - this.identifiers = identifiers; - } + public ScholixEntityId(String name, List identifiers) { + this.name = name; + this.identifiers = identifiers; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public List getIdentifiers() { - return identifiers; - } + public List getIdentifiers() { + return identifiers; + } - public void setIdentifiers(List identifiers) { - this.identifiers = identifiers; - } + public void setIdentifiers(List identifiers) { + this.identifiers = identifiers; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java index 265ac1ef50..0dd15336a4 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java @@ -1,31 +1,33 @@ + package eu.dnetlib.dhp.provision.scholix; import java.io.Serializable; public class ScholixIdentifier implements Serializable { - private String identifier; - private String schema; + private String identifier; + private String schema; - public ScholixIdentifier() {} + public ScholixIdentifier() { + } - public ScholixIdentifier(String identifier, String schema) { - this.identifier = identifier; - this.schema = schema; - } + public ScholixIdentifier(String identifier, String schema) { + this.identifier = identifier; + this.schema = schema; + } - public String getIdentifier() { - return identifier; - } + public String getIdentifier() { 
+ return identifier; + } - public void setIdentifier(String identifier) { - this.identifier = identifier; - } + public void setIdentifier(String identifier) { + this.identifier = identifier; + } - public String getSchema() { - return schema; - } + public String getSchema() { + return schema; + } - public void setSchema(String schema) { - this.schema = schema; - } + public void setSchema(String schema) { + this.schema = schema; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java index b363eff2cf..0cbdf43e79 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java @@ -1,41 +1,43 @@ + package eu.dnetlib.dhp.provision.scholix; import java.io.Serializable; public class ScholixRelationship implements Serializable { - private String name; - private String schema; - private String inverse; + private String name; + private String schema; + private String inverse; - public ScholixRelationship() {} + public ScholixRelationship() { + } - public ScholixRelationship(String name, String schema, String inverse) { - this.name = name; - this.schema = schema; - this.inverse = inverse; - } + public ScholixRelationship(String name, String schema, String inverse) { + this.name = name; + this.schema = schema; + this.inverse = inverse; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getSchema() { - return schema; - } + public String getSchema() { + return schema; + } - public void setSchema(String 
schema) { - this.schema = schema; - } + public void setSchema(String schema) { + this.schema = schema; + } - public String getInverse() { - return inverse; - } + public String getInverse() { + return inverse; + } - public void setInverse(String inverse) { - this.inverse = inverse; - } + public void setInverse(String inverse) { + this.inverse = inverse; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java index 89342d2814..6de30c7481 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java @@ -1,137 +1,151 @@ + package eu.dnetlib.dhp.provision.scholix; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; import java.io.Serializable; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; + public class ScholixResource implements Serializable { - private List identifier; - private String dnetIdentifier; - private String objectType; - private String objectSubType; - private String title; - private List creator; - private String publicationDate; - private List publisher; - private List collectedFrom; + private List identifier; + private String dnetIdentifier; + private String objectType; + private String objectSubType; + private String title; + private List creator; + private String publicationDate; + private List publisher; + private List collectedFrom; - public static ScholixResource fromSummary(ScholixSummary summary) { + public static ScholixResource fromSummary(ScholixSummary summary) { - final ScholixResource resource = new ScholixResource(); + final ScholixResource 
resource = new ScholixResource(); - resource.setDnetIdentifier(summary.getId()); + resource.setDnetIdentifier(summary.getId()); - resource.setIdentifier( - summary.getLocalIdentifier().stream() - .map(i -> new ScholixIdentifier(i.getId(), i.getType())) - .collect(Collectors.toList())); + resource + .setIdentifier( + summary + .getLocalIdentifier() + .stream() + .map(i -> new ScholixIdentifier(i.getId(), i.getType())) + .collect(Collectors.toList())); - resource.setObjectType(summary.getTypology().toString()); + resource.setObjectType(summary.getTypology().toString()); - if (summary.getTitle() != null && summary.getTitle().size() > 0) - resource.setTitle(summary.getTitle().get(0)); + if (summary.getTitle() != null && summary.getTitle().size() > 0) + resource.setTitle(summary.getTitle().get(0)); - if (summary.getAuthor() != null) - resource.setCreator( - summary.getAuthor().stream() - .map(c -> new ScholixEntityId(c, null)) - .collect(Collectors.toList())); + if (summary.getAuthor() != null) + resource + .setCreator( + summary + .getAuthor() + .stream() + .map(c -> new ScholixEntityId(c, null)) + .collect(Collectors.toList())); - if (summary.getDate() != null && summary.getDate().size() > 0) - resource.setPublicationDate(summary.getDate().get(0)); - if (summary.getPublisher() != null) - resource.setPublisher( - summary.getPublisher().stream() - .map(p -> new ScholixEntityId(p, null)) - .collect(Collectors.toList())); - if (summary.getDatasources() != null) - resource.setCollectedFrom( - summary.getDatasources().stream() - .map( - d -> - new ScholixCollectedFrom( - new ScholixEntityId( - d.getDatasourceName(), - Collections.singletonList( - new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))), - "collected", - d.getCompletionStatus())) - .collect(Collectors.toList())); - return resource; - } + if (summary.getDate() != null && summary.getDate().size() > 0) + resource.setPublicationDate(summary.getDate().get(0)); + if (summary.getPublisher() != null) + 
resource + .setPublisher( + summary + .getPublisher() + .stream() + .map(p -> new ScholixEntityId(p, null)) + .collect(Collectors.toList())); + if (summary.getDatasources() != null) + resource + .setCollectedFrom( + summary + .getDatasources() + .stream() + .map( + d -> new ScholixCollectedFrom( + new ScholixEntityId( + d.getDatasourceName(), + Collections + .singletonList( + new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))), + "collected", + d.getCompletionStatus())) + .collect(Collectors.toList())); + return resource; + } - public List getIdentifier() { - return identifier; - } + public List getIdentifier() { + return identifier; + } - public void setIdentifier(List identifier) { - this.identifier = identifier; - } + public void setIdentifier(List identifier) { + this.identifier = identifier; + } - public String getDnetIdentifier() { - return dnetIdentifier; - } + public String getDnetIdentifier() { + return dnetIdentifier; + } - public void setDnetIdentifier(String dnetIdentifier) { - this.dnetIdentifier = dnetIdentifier; - } + public void setDnetIdentifier(String dnetIdentifier) { + this.dnetIdentifier = dnetIdentifier; + } - public String getObjectType() { - return objectType; - } + public String getObjectType() { + return objectType; + } - public void setObjectType(String objectType) { - this.objectType = objectType; - } + public void setObjectType(String objectType) { + this.objectType = objectType; + } - public String getObjectSubType() { - return objectSubType; - } + public String getObjectSubType() { + return objectSubType; + } - public void setObjectSubType(String objectSubType) { - this.objectSubType = objectSubType; - } + public void setObjectSubType(String objectSubType) { + this.objectSubType = objectSubType; + } - public String getTitle() { - return title; - } + public String getTitle() { + return title; + } - public void setTitle(String title) { - this.title = title; - } + public void setTitle(String title) { + this.title = title; + } 
- public List getCreator() { - return creator; - } + public List getCreator() { + return creator; + } - public void setCreator(List creator) { - this.creator = creator; - } + public void setCreator(List creator) { + this.creator = creator; + } - public String getPublicationDate() { - return publicationDate; - } + public String getPublicationDate() { + return publicationDate; + } - public void setPublicationDate(String publicationDate) { - this.publicationDate = publicationDate; - } + public void setPublicationDate(String publicationDate) { + this.publicationDate = publicationDate; + } - public List getPublisher() { - return publisher; - } + public List getPublisher() { + return publisher; + } - public void setPublisher(List publisher) { - this.publisher = publisher; - } + public void setPublisher(List publisher) { + this.publisher = publisher; + } - public List getCollectedFrom() { - return collectedFrom; - } + public List getCollectedFrom() { + return collectedFrom; + } - public void setCollectedFrom(List collectedFrom) { - this.collectedFrom = collectedFrom; - } + public void setCollectedFrom(List collectedFrom) { + this.collectedFrom = collectedFrom; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java index 0d58eacd61..6d6f46f544 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java @@ -1,42 +1,44 @@ + package eu.dnetlib.dhp.provision.scholix.summary; import java.io.Serializable; public class CollectedFromType implements Serializable { - private String datasourceName; - private String datasourceId; - private String completionStatus; 
+ private String datasourceName; + private String datasourceId; + private String completionStatus; - public CollectedFromType() {} + public CollectedFromType() { + } - public CollectedFromType(String datasourceName, String datasourceId, String completionStatus) { - this.datasourceName = datasourceName; - this.datasourceId = datasourceId; - this.completionStatus = completionStatus; - } + public CollectedFromType(String datasourceName, String datasourceId, String completionStatus) { + this.datasourceName = datasourceName; + this.datasourceId = datasourceId; + this.completionStatus = completionStatus; + } - public String getDatasourceName() { - return datasourceName; - } + public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(String datasourceName) { + this.datasourceName = datasourceName; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(String datasourceId) { + this.datasourceId = datasourceId; + } - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java index 8338e39956..e9d94fccf9 100644 --- 
a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java @@ -1,31 +1,33 @@ + package eu.dnetlib.dhp.provision.scholix.summary; import java.io.Serializable; public class SchemeValue implements Serializable { - private String scheme; - private String value; + private String scheme; + private String value; - public SchemeValue() {} + public SchemeValue() { + } - public SchemeValue(String scheme, String value) { - this.scheme = scheme; - this.value = value; - } + public SchemeValue(String scheme, String value) { + this.scheme = scheme; + this.value = value; + } - public String getScheme() { - return scheme; - } + public String getScheme() { + return scheme; + } - public void setScheme(String scheme) { - this.scheme = scheme; - } + public void setScheme(String scheme) { + this.scheme = scheme; + } - public String getValue() { - return value; - } + public String getValue() { + return value; + } - public void setValue(String value) { - this.value = value; - } + public void setValue(String value) { + this.value = value; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java index 50aa2a75cc..e5ea8b9f5a 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java @@ -1,313 +1,353 @@ + package eu.dnetlib.dhp.provision.scholix.summary; +import java.io.Serializable; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + import 
com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.provision.RelatedItemInfo; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; -import java.io.Serializable; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; public class ScholixSummary implements Serializable { - private String id; - private List localIdentifier; - private Typology typology; - private List title; - private List author; - private List date; - private String description; - private List subject; - private List publisher; - private long relatedPublications; - private long relatedDatasets; - private long relatedUnknown; - private List datasources; + private String id; + private List localIdentifier; + private Typology typology; + private List title; + private List author; + private List date; + private String description; + private List subject; + private List publisher; + private long relatedPublications; + private long relatedDatasets; + private long relatedUnknown; + private List datasources; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public List getLocalIdentifier() { - return localIdentifier; - } + public List getLocalIdentifier() { + return localIdentifier; + } - public void setLocalIdentifier(List localIdentifier) { - this.localIdentifier = localIdentifier; - } + public void setLocalIdentifier(List localIdentifier) { + this.localIdentifier = localIdentifier; + } - public Typology getTypology() { - return typology; - } + public Typology getTypology() { + return 
typology; + } - public void setTypology(Typology typology) { - this.typology = typology; - } + public void setTypology(Typology typology) { + this.typology = typology; + } - public List getTitle() { - return title; - } + public List getTitle() { + return title; + } - public void setTitle(List title) { - this.title = title; - } + public void setTitle(List title) { + this.title = title; + } - public List getAuthor() { - return author; - } + public List getAuthor() { + return author; + } - public void setAuthor(List author) { - this.author = author; - } + public void setAuthor(List author) { + this.author = author; + } - public List getDate() { - return date; - } + public List getDate() { + return date; + } - public void setDate(List date) { - this.date = date; - } + public void setDate(List date) { + this.date = date; + } - @JsonProperty("abstract") - public String getDescription() { - return description; - } + @JsonProperty("abstract") + public String getDescription() { + return description; + } - @JsonProperty("abstract") - public void setDescription(String description) { - this.description = description; - } + @JsonProperty("abstract") + public void setDescription(String description) { + this.description = description; + } - public List getSubject() { - return subject; - } + public List getSubject() { + return subject; + } - public void setSubject(List subject) { - this.subject = subject; - } + public void setSubject(List subject) { + this.subject = subject; + } - public List getPublisher() { - return publisher; - } + public List getPublisher() { + return publisher; + } - public void setPublisher(List publisher) { - this.publisher = publisher; - } + public void setPublisher(List publisher) { + this.publisher = publisher; + } - public long getRelatedPublications() { - return relatedPublications; - } + public long getRelatedPublications() { + return relatedPublications; + } - public void setRelatedPublications(long relatedPublications) { - this.relatedPublications = 
relatedPublications; - } + public void setRelatedPublications(long relatedPublications) { + this.relatedPublications = relatedPublications; + } - public long getRelatedDatasets() { - return relatedDatasets; - } + public long getRelatedDatasets() { + return relatedDatasets; + } - public void setRelatedDatasets(long relatedDatasets) { - this.relatedDatasets = relatedDatasets; - } + public void setRelatedDatasets(long relatedDatasets) { + this.relatedDatasets = relatedDatasets; + } - public long getRelatedUnknown() { - return relatedUnknown; - } + public long getRelatedUnknown() { + return relatedUnknown; + } - public void setRelatedUnknown(long relatedUnknown) { - this.relatedUnknown = relatedUnknown; - } + public void setRelatedUnknown(long relatedUnknown) { + this.relatedUnknown = relatedUnknown; + } - public List getDatasources() { - return datasources; - } + public List getDatasources() { + return datasources; + } - public void setDatasources(List datasources) { - this.datasources = datasources; - } + public void setDatasources(List datasources) { + this.datasources = datasources; + } - public static ScholixSummary fromJsonOAF(final Typology oafType, final String oafJson) { - try { - final ObjectMapper mapper = new ObjectMapper(); - final RelatedItemInfo relatedItemInfo = new RelatedItemInfo(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - switch (oafType) { - case dataset: - return summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo); - case publication: - return summaryFromPublication( - mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo); - case unknown: - return summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo); - } - } catch (Throwable e) { - throw new RuntimeException(e); - } - return null; - } + public static ScholixSummary fromJsonOAF(final Typology oafType, final String oafJson) { + try { + final ObjectMapper mapper = new ObjectMapper(); + final 
RelatedItemInfo relatedItemInfo = new RelatedItemInfo(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + switch (oafType) { + case dataset: + return summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo); + case publication: + return summaryFromPublication( + mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo); + case unknown: + return summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo); + } + } catch (Throwable e) { + throw new RuntimeException(e); + } + return null; + } - public static String fromJsonOAF( - final Typology oafType, final String oafJson, final String relEntityJson) { - try { - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + public static String fromJsonOAF( + final Typology oafType, final String oafJson, final String relEntityJson) { + try { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - RelatedItemInfo relatedItemInfo = mapper.readValue(relEntityJson, RelatedItemInfo.class); + RelatedItemInfo relatedItemInfo = mapper.readValue(relEntityJson, RelatedItemInfo.class); - switch (oafType) { - case dataset: - return mapper.writeValueAsString( - summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo)); - case publication: - return mapper.writeValueAsString( - summaryFromPublication( - mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo)); - case unknown: - return mapper.writeValueAsString( - summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo)); - } + switch (oafType) { + case dataset: + return mapper + .writeValueAsString( + summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo)); + case publication: + return mapper + .writeValueAsString( + summaryFromPublication( + mapper.readValue(oafJson, 
DLIPublication.class), relatedItemInfo)); + case unknown: + return mapper + .writeValueAsString( + summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo)); + } - } catch (Throwable e) { - throw new RuntimeException(e); - } + } catch (Throwable e) { + throw new RuntimeException(e); + } - return null; - } + return null; + } - private static ScholixSummary summaryFromDataset( - final DLIDataset item, final RelatedItemInfo relatedItemInfo) { - ScholixSummary summary = new ScholixSummary(); - summary.setId(item.getId()); + private static ScholixSummary summaryFromDataset( + final DLIDataset item, final RelatedItemInfo relatedItemInfo) { + ScholixSummary summary = new ScholixSummary(); + summary.setId(item.getId()); - if (item.getPid() != null) - summary.setLocalIdentifier( - item.getPid().stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList())); + if (item.getPid() != null) + summary + .setLocalIdentifier( + item + .getPid() + .stream() + .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) + .collect(Collectors.toList())); - summary.setTypology(Typology.dataset); - if (item.getTitle() != null) - summary.setTitle( - item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); + summary.setTypology(Typology.dataset); + if (item.getTitle() != null) + summary + .setTitle( + item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); - if (item.getAuthor() != null) { - summary.setAuthor( - item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); - } + if (item.getAuthor() != null) { + summary + .setAuthor( + item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); + } - if (item.getRelevantdate() != null) - summary.setDate( - item.getRelevantdate().stream() - .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) - 
.map(StructuredProperty::getValue) - .collect(Collectors.toList())); + if (item.getRelevantdate() != null) + summary + .setDate( + item + .getRelevantdate() + .stream() + .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) + .map(StructuredProperty::getValue) + .collect(Collectors.toList())); - if (item.getDescription() != null && item.getDescription().size() > 0) - summary.setDescription(item.getDescription().get(0).getValue()); + if (item.getDescription() != null && item.getDescription().size() > 0) + summary.setDescription(item.getDescription().get(0).getValue()); - if (item.getSubject() != null) { - summary.setSubject( - item.getSubject().stream() - .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) - .collect(Collectors.toList())); - } - if (item.getPublisher() != null) - summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); + if (item.getSubject() != null) { + summary + .setSubject( + item + .getSubject() + .stream() + .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) + .collect(Collectors.toList())); + } + if (item.getPublisher() != null) + summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); + summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); + summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); + summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - if (item.getDlicollectedfrom() != null) - summary.setDatasources( - item.getDlicollectedfrom().stream() - .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) - .collect(Collectors.toList())); - return summary; - } + if (item.getDlicollectedfrom() != null) + summary + .setDatasources( + item + .getDlicollectedfrom() 
+ .stream() + .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) + .collect(Collectors.toList())); + return summary; + } - private static ScholixSummary summaryFromPublication( - final DLIPublication item, final RelatedItemInfo relatedItemInfo) { - ScholixSummary summary = new ScholixSummary(); - summary.setId(item.getId()); + private static ScholixSummary summaryFromPublication( + final DLIPublication item, final RelatedItemInfo relatedItemInfo) { + ScholixSummary summary = new ScholixSummary(); + summary.setId(item.getId()); - if (item.getPid() != null) - summary.setLocalIdentifier( - item.getPid().stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList())); + if (item.getPid() != null) + summary + .setLocalIdentifier( + item + .getPid() + .stream() + .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) + .collect(Collectors.toList())); - summary.setTypology(Typology.publication); - if (item.getTitle() != null) - summary.setTitle( - item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); + summary.setTypology(Typology.publication); + if (item.getTitle() != null) + summary + .setTitle( + item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); - if (item.getAuthor() != null) { - summary.setAuthor( - item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); - } + if (item.getAuthor() != null) { + summary + .setAuthor( + item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); + } - if (item.getRelevantdate() != null) - summary.setDate( - item.getRelevantdate().stream() - .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) - .map(StructuredProperty::getValue) - .collect(Collectors.toList())); + if (item.getRelevantdate() != null) + summary + .setDate( + item + .getRelevantdate() + .stream() + .filter(d -> 
"date".equalsIgnoreCase(d.getQualifier().getClassname())) + .map(StructuredProperty::getValue) + .collect(Collectors.toList())); - if (item.getDescription() != null && item.getDescription().size() > 0) - summary.setDescription(item.getDescription().get(0).getValue()); + if (item.getDescription() != null && item.getDescription().size() > 0) + summary.setDescription(item.getDescription().get(0).getValue()); - if (item.getSubject() != null) { - summary.setSubject( - item.getSubject().stream() - .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) - .collect(Collectors.toList())); - } + if (item.getSubject() != null) { + summary + .setSubject( + item + .getSubject() + .stream() + .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) + .collect(Collectors.toList())); + } - if (item.getPublisher() != null) - summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); + if (item.getPublisher() != null) + summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); + summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); + summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); + summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - if (item.getDlicollectedfrom() != null) - summary.setDatasources( - item.getDlicollectedfrom().stream() - .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) - .collect(Collectors.toList())); + if (item.getDlicollectedfrom() != null) + summary + .setDatasources( + item + .getDlicollectedfrom() + .stream() + .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) + .collect(Collectors.toList())); - return summary; - } + return summary; + } - private static ScholixSummary 
summaryFromUnknown( - final DLIUnknown item, final RelatedItemInfo relatedItemInfo) { - ScholixSummary summary = new ScholixSummary(); - summary.setId(item.getId()); - if (item.getPid() != null) - summary.setLocalIdentifier( - item.getPid().stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList())); + private static ScholixSummary summaryFromUnknown( + final DLIUnknown item, final RelatedItemInfo relatedItemInfo) { + ScholixSummary summary = new ScholixSummary(); + summary.setId(item.getId()); + if (item.getPid() != null) + summary + .setLocalIdentifier( + item + .getPid() + .stream() + .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) + .collect(Collectors.toList())); - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - summary.setTypology(Typology.unknown); - if (item.getDlicollectedfrom() != null) - summary.setDatasources( - item.getDlicollectedfrom().stream() - .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) - .collect(Collectors.toList())); - return summary; - } + summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); + summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); + summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); + summary.setTypology(Typology.unknown); + if (item.getDlicollectedfrom() != null) + summary + .setDatasources( + item + .getDlicollectedfrom() + .stream() + .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) + .collect(Collectors.toList())); + return summary; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java 
b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java index 773695effd..c4148ad242 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java @@ -1,31 +1,33 @@ + package eu.dnetlib.dhp.provision.scholix.summary; import java.io.Serializable; public class TypedIdentifier implements Serializable { - private String id; - private String type; + private String id; + private String type; - public TypedIdentifier() {} + public TypedIdentifier() { + } - public TypedIdentifier(String id, String type) { - this.id = id; - this.type = type; - } + public TypedIdentifier(String id, String type) { + this.id = id; + this.type = type; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(String type) { - this.type = type; - } + public void setType(String type) { + this.type = type; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java index d90e224f9c..effa32b6bc 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java @@ -1,9 +1,8 @@ + package eu.dnetlib.dhp.provision.scholix.summary; import java.io.Serializable; public enum Typology implements Serializable { - 
dataset, - publication, - unknown + dataset, publication, unknown } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java index 7e8e7aef3f..bc9562e08b 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java @@ -1,121 +1,131 @@ + package eu.dnetlib.dhp.provision.update; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.JsonParser; + import eu.dnetlib.dhp.provision.scholix.ScholixCollectedFrom; import eu.dnetlib.dhp.provision.scholix.ScholixEntityId; import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; import eu.dnetlib.dhp.provision.scholix.ScholixResource; import eu.dnetlib.dhp.utils.DHPUtils; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; public class CrossRefParserJSON { - private static List collectedFrom = - generateCrossrefCollectedFrom("complete"); + private static List collectedFrom = generateCrossrefCollectedFrom("complete"); - public static ScholixResource parseRecord(final String record) { - if (record == null) return null; - JsonElement jElement = new JsonParser().parse(record); - JsonElement source = null; - if (jElement.getAsJsonObject().has("_source")) { - source = jElement.getAsJsonObject().get("_source"); - if (source == null || !source.isJsonObject()) return null; - } else if (jElement.getAsJsonObject().has("DOI")) { - source = jElement; - } else { - return null; - } + public static ScholixResource parseRecord(final String record) { + if (record == null) + 
return null; + JsonElement jElement = new JsonParser().parse(record); + JsonElement source = null; + if (jElement.getAsJsonObject().has("_source")) { + source = jElement.getAsJsonObject().get("_source"); + if (source == null || !source.isJsonObject()) + return null; + } else if (jElement.getAsJsonObject().has("DOI")) { + source = jElement; + } else { + return null; + } - final JsonObject message = source.getAsJsonObject(); - ScholixResource currentObject = new ScholixResource(); + final JsonObject message = source.getAsJsonObject(); + ScholixResource currentObject = new ScholixResource(); - if (message.get("DOI") != null) { - final String doi = message.get("DOI").getAsString(); - currentObject.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); - } + if (message.get("DOI") != null) { + final String doi = message.get("DOI").getAsString(); + currentObject.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); + } - if ((!message.get("created").isJsonNull()) - && (message.getAsJsonObject("created").get("date-time") != null)) { - currentObject.setPublicationDate( - message.getAsJsonObject("created").get("date-time").getAsString()); - } + if ((!message.get("created").isJsonNull()) + && (message.getAsJsonObject("created").get("date-time") != null)) { + currentObject + .setPublicationDate( + message.getAsJsonObject("created").get("date-time").getAsString()); + } - if (message.get("title") != null - && !message.get("title").isJsonNull() - && message.get("title").isJsonArray()) { + if (message.get("title") != null + && !message.get("title").isJsonNull() + && message.get("title").isJsonArray()) { - JsonArray array = message.get("title").getAsJsonArray(); - currentObject.setTitle(array.get(0).getAsString()); - } - if (message.get("author") != null && !message.get("author").isJsonNull()) { - JsonArray author = message.getAsJsonArray("author"); - List authorList = new ArrayList<>(); - for (JsonElement anAuthor : author) { - 
JsonObject currentAuth = anAuthor.getAsJsonObject(); + JsonArray array = message.get("title").getAsJsonArray(); + currentObject.setTitle(array.get(0).getAsString()); + } + if (message.get("author") != null && !message.get("author").isJsonNull()) { + JsonArray author = message.getAsJsonArray("author"); + List authorList = new ArrayList<>(); + for (JsonElement anAuthor : author) { + JsonObject currentAuth = anAuthor.getAsJsonObject(); - String family = ""; - String given = ""; - if (currentAuth != null - && currentAuth.get("family") != null - && !currentAuth.get("family").isJsonNull()) { - family = currentAuth.get("family").getAsString(); - } - if (currentAuth != null - && currentAuth.get("given") != null - && !currentAuth.get("given").isJsonNull()) { - given = currentAuth.get("given").getAsString(); - } - authorList.add(new ScholixEntityId(String.format("%s %s", family, given), null)); - } - currentObject.setCreator(authorList); - } - if (message.get("publisher") != null && !message.get("publisher").isJsonNull()) { - currentObject.setPublisher( - Collections.singletonList( - new ScholixEntityId(message.get("publisher").getAsString(), null))); - } - currentObject.setCollectedFrom(collectedFrom); - currentObject.setObjectType("publication"); - currentObject.setDnetIdentifier( - generateId(message.get("DOI").getAsString(), "doi", "publication")); + String family = ""; + String given = ""; + if (currentAuth != null + && currentAuth.get("family") != null + && !currentAuth.get("family").isJsonNull()) { + family = currentAuth.get("family").getAsString(); + } + if (currentAuth != null + && currentAuth.get("given") != null + && !currentAuth.get("given").isJsonNull()) { + given = currentAuth.get("given").getAsString(); + } + authorList.add(new ScholixEntityId(String.format("%s %s", family, given), null)); + } + currentObject.setCreator(authorList); + } + if (message.get("publisher") != null && !message.get("publisher").isJsonNull()) { + currentObject + .setPublisher( + 
Collections + .singletonList( + new ScholixEntityId(message.get("publisher").getAsString(), null))); + } + currentObject.setCollectedFrom(collectedFrom); + currentObject.setObjectType("publication"); + currentObject + .setDnetIdentifier( + generateId(message.get("DOI").getAsString(), "doi", "publication")); - return currentObject; - } + return currentObject; + } - private static List generateCrossrefCollectedFrom( - final String completionStatus) { - final ScholixEntityId scholixEntityId = - new ScholixEntityId( - "Crossref", - Collections.singletonList( - new ScholixIdentifier("dli_________::crossref", "dnet_identifier"))); - return Collections.singletonList( - new ScholixCollectedFrom(scholixEntityId, "resolved", completionStatus)); - } + private static List generateCrossrefCollectedFrom( + final String completionStatus) { + final ScholixEntityId scholixEntityId = new ScholixEntityId( + "Crossref", + Collections + .singletonList( + new ScholixIdentifier("dli_________::crossref", "dnet_identifier"))); + return Collections + .singletonList( + new ScholixCollectedFrom(scholixEntityId, "resolved", completionStatus)); + } - private static String generateId( - final String pid, final String pidType, final String entityType) { - String type; - switch (entityType) { - case "publication": - type = "50|"; - break; - case "dataset": - type = "60|"; - break; - case "unknown": - type = "70|"; - break; - default: - throw new IllegalArgumentException("unexpected value " + entityType); - } - return type - + DHPUtils.md5( - String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); - } + private static String generateId( + final String pid, final String pidType, final String entityType) { + String type; + switch (entityType) { + case "publication": + type = "50|"; + break; + case "dataset": + type = "60|"; + break; + case "unknown": + type = "70|"; + break; + default: + throw new IllegalArgumentException("unexpected value " + entityType); + } + return type 
+ + DHPUtils + .md5( + String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java index e5aa38c1d6..fac1da2539 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java @@ -1,10 +1,9 @@ + package eu.dnetlib.dhp.provision.update; -import com.google.gson.JsonElement; -import com.google.gson.JsonParser; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; import java.io.ByteArrayOutputStream; import java.util.zip.Inflater; + import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; @@ -12,77 +11,81 @@ import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; +import com.google.gson.JsonElement; +import com.google.gson.JsonParser; + +import eu.dnetlib.dhp.provision.scholix.ScholixResource; + public class CrossrefClient { - private String host; - private String index = "crossref"; - private String indexType = "item"; + private String host; + private String index = "crossref"; + private String indexType = "item"; - public CrossrefClient(String host) { - this.host = host; - } + public CrossrefClient(String host) { + this.host = host; + } - public String getHost() { - return host; - } + public String getHost() { + return host; + } - public void setHost(String host) { - this.host = host; - } + public void setHost(String host) { + this.host = host; + } - public String getIndex() { - return index; - } + public String getIndex() { + return 
index; + } - public void setIndex(String index) { - this.index = index; - } + public void setIndex(String index) { + this.index = index; + } - public String getIndexType() { - return indexType; - } + public String getIndexType() { + return indexType; + } - public void setIndexType(String indexType) { - this.indexType = indexType; - } + public void setIndexType(String indexType) { + this.indexType = indexType; + } - private static String decompressBlob(final String blob) { - try { - byte[] byteArray = Base64.decodeBase64(blob.getBytes()); - final Inflater decompresser = new Inflater(); - decompresser.setInput(byteArray); - final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length); - byte[] buffer = new byte[8192]; - while (!decompresser.finished()) { - int size = decompresser.inflate(buffer); - bos.write(buffer, 0, size); - } - byte[] unzippeddata = bos.toByteArray(); - decompresser.end(); - return new String(unzippeddata); - } catch (Throwable e) { - throw new RuntimeException("Wrong record:" + blob, e); - } - } + private static String decompressBlob(final String blob) { + try { + byte[] byteArray = Base64.decodeBase64(blob.getBytes()); + final Inflater decompresser = new Inflater(); + decompresser.setInput(byteArray); + final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length); + byte[] buffer = new byte[8192]; + while (!decompresser.finished()) { + int size = decompresser.inflate(buffer); + bos.write(buffer, 0, size); + } + byte[] unzippeddata = bos.toByteArray(); + decompresser.end(); + return new String(unzippeddata); + } catch (Throwable e) { + throw new RuntimeException("Wrong record:" + blob, e); + } + } - public ScholixResource getResourceByDOI(final String doi) { - try (CloseableHttpClient client = HttpClients.createDefault()) { - HttpGet httpGet = - new HttpGet( - String.format( - "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); - CloseableHttpResponse response = 
client.execute(httpGet); - String json = IOUtils.toString(response.getEntity().getContent()); - if (json.contains("blob")) { - JsonParser p = new JsonParser(); - final JsonElement root = p.parse(json); - json = - decompressBlob( - root.getAsJsonObject().get("_source").getAsJsonObject().get("blob").getAsString()); - } - return CrossRefParserJSON.parseRecord(json); - } catch (Throwable e) { - return null; - } - } + public ScholixResource getResourceByDOI(final String doi) { + try (CloseableHttpClient client = HttpClients.createDefault()) { + HttpGet httpGet = new HttpGet( + String + .format( + "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); + CloseableHttpResponse response = client.execute(httpGet); + String json = IOUtils.toString(response.getEntity().getContent()); + if (json.contains("blob")) { + JsonParser p = new JsonParser(); + final JsonElement root = p.parse(json); + json = decompressBlob( + root.getAsJsonObject().get("_source").getAsJsonObject().get("blob").getAsString()); + } + return CrossRefParserJSON.parseRecord(json); + } catch (Throwable e) { + return null; + } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java index 3eed64d4df..10426b29c8 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java @@ -1,218 +1,229 @@ + package eu.dnetlib.dhp.provision.update; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.provision.scholix.*; -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.scholexplorer.relation.RelInfo; -import eu.dnetlib.scholexplorer.relation.RelationMapper; import java.util.ArrayList; import 
java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; + import org.apache.commons.lang3.StringUtils; +import com.jayway.jsonpath.JsonPath; + +import eu.dnetlib.dhp.provision.scholix.*; +import eu.dnetlib.dhp.utils.DHPUtils; +import eu.dnetlib.scholexplorer.relation.RelInfo; +import eu.dnetlib.scholexplorer.relation.RelationMapper; + public class Datacite2Scholix { - private String rootPath = "$.attributes"; - final RelationMapper relationMapper; + private String rootPath = "$.attributes"; + final RelationMapper relationMapper; - public Datacite2Scholix(RelationMapper relationMapper) { - this.relationMapper = relationMapper; - } + public Datacite2Scholix(RelationMapper relationMapper) { + this.relationMapper = relationMapper; + } - public List generateScholixFromJson(final String dJson) { - List> relIds = getRelatedIendtifiers(dJson); - relIds = - relIds != null - ? relIds.stream() - .filter( - m -> - m.containsKey("relatedIdentifierType") - && m.containsKey("relationType") - && m.containsKey("relatedIdentifier")) - .collect(Collectors.toList()) - : null; - if (relIds == null || relIds.size() == 0) return null; + public List generateScholixFromJson(final String dJson) { + List> relIds = getRelatedIendtifiers(dJson); + relIds = relIds != null + ? 
relIds + .stream() + .filter( + m -> m.containsKey("relatedIdentifierType") + && m.containsKey("relationType") + && m.containsKey("relatedIdentifier")) + .collect(Collectors.toList()) + : null; + if (relIds == null || relIds.size() == 0) + return null; - final String updated = JsonPath.read(dJson, rootPath + ".updated"); - ScholixResource resource = generateDataciteScholixResource(dJson); + final String updated = JsonPath.read(dJson, rootPath + ".updated"); + ScholixResource resource = generateDataciteScholixResource(dJson); - return relIds.stream() - .flatMap( - s -> { - try { - final List result = - generateScholix( - resource, - "" + s.get("relatedIdentifier"), - s.get("relatedIdentifierType"), - s.get("relationType"), - updated); - return result.stream(); - } catch (Throwable e) { - return new ArrayList().stream(); - } - }) - .collect(Collectors.toList()); - } + return relIds + .stream() + .flatMap( + s -> { + try { + final List result = generateScholix( + resource, + "" + s.get("relatedIdentifier"), + s.get("relatedIdentifierType"), + s.get("relationType"), + updated); + return result.stream(); + } catch (Throwable e) { + return new ArrayList().stream(); + } + }) + .collect(Collectors.toList()); + } - public String getRootPath() { - return rootPath; - } + public String getRootPath() { + return rootPath; + } - public void setRootPath(String rootPath) { - this.rootPath = rootPath; - } + public void setRootPath(String rootPath) { + this.rootPath = rootPath; + } - private List generateScholix( - ScholixResource source, - final String pid, - final String pidtype, - final String relType, - final String updated) { + private List generateScholix( + ScholixResource source, + final String pid, + final String pidtype, + final String relType, + final String updated) { - if ("doi".equalsIgnoreCase(pidtype)) { - ScholixResource target = new ScholixResource(); - target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); - final RelInfo relInfo = 
relationMapper.get(relType.toLowerCase()); - final ScholixRelationship rel = - new ScholixRelationship(relInfo.getOriginal(), "datacite", relInfo.getInverse()); - final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); - final Scholix s = new Scholix(); - s.setSource(source); - s.setTarget(target); - s.setLinkprovider(Collections.singletonList(provider)); - s.setPublisher(source.getPublisher()); - s.setRelationship(rel); - s.setPublicationDate(updated); - return Collections.singletonList(s); - } else { - final List result = new ArrayList<>(); - ScholixResource target = new ScholixResource(); - target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); - target.setDnetIdentifier(generateId(pid, pidtype, "unknown")); - target.setObjectType("unknown"); - target.setCollectedFrom(generateDataciteCollectedFrom("incomplete")); - final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); - final ScholixRelationship rel = - new ScholixRelationship(relInfo.getOriginal(), "datacite", relInfo.getInverse()); - final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); - final Scholix s = new Scholix(); - s.setSource(source); - s.setTarget(target); - s.setLinkprovider(Collections.singletonList(provider)); - s.setPublisher(source.getPublisher()); - s.setRelationship(rel); - s.setPublicationDate(updated); - s.generateIdentifier(); - result.add(s); - final Scholix s2 = new Scholix(); - s2.setSource(target); - s2.setTarget(source); - s2.setLinkprovider(Collections.singletonList(provider)); - s2.setPublisher(source.getPublisher()); - s2.setRelationship( - new ScholixRelationship(relInfo.getInverse(), "datacite", relInfo.getOriginal())); - s2.setPublicationDate(updated); - s2.generateIdentifier(); - result.add(s2); - return result; - } - } + if ("doi".equalsIgnoreCase(pidtype)) { + ScholixResource target = new ScholixResource(); + target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, 
pidtype))); + final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); + final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite", + relInfo.getInverse()); + final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); + final Scholix s = new Scholix(); + s.setSource(source); + s.setTarget(target); + s.setLinkprovider(Collections.singletonList(provider)); + s.setPublisher(source.getPublisher()); + s.setRelationship(rel); + s.setPublicationDate(updated); + return Collections.singletonList(s); + } else { + final List result = new ArrayList<>(); + ScholixResource target = new ScholixResource(); + target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); + target.setDnetIdentifier(generateId(pid, pidtype, "unknown")); + target.setObjectType("unknown"); + target.setCollectedFrom(generateDataciteCollectedFrom("incomplete")); + final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); + final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite", + relInfo.getInverse()); + final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); + final Scholix s = new Scholix(); + s.setSource(source); + s.setTarget(target); + s.setLinkprovider(Collections.singletonList(provider)); + s.setPublisher(source.getPublisher()); + s.setRelationship(rel); + s.setPublicationDate(updated); + s.generateIdentifier(); + result.add(s); + final Scholix s2 = new Scholix(); + s2.setSource(target); + s2.setTarget(source); + s2.setLinkprovider(Collections.singletonList(provider)); + s2.setPublisher(source.getPublisher()); + s2 + .setRelationship( + new ScholixRelationship(relInfo.getInverse(), "datacite", relInfo.getOriginal())); + s2.setPublicationDate(updated); + s2.generateIdentifier(); + result.add(s2); + return result; + } + } - public ScholixResource generateDataciteScholixResource(String dJson) { - ScholixResource resource = new ScholixResource(); - 
String DOI_PATH = rootPath + ".doi"; - final String doi = JsonPath.read(dJson, DOI_PATH); - resource.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); - resource.setObjectType(getType(dJson)); - resource.setDnetIdentifier(generateId(doi, "doi", resource.getObjectType())); - resource.setCollectedFrom(generateDataciteCollectedFrom("complete")); - final String publisher = JsonPath.read(dJson, rootPath + ".publisher"); - if (StringUtils.isNotBlank(publisher)) - resource.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null))); - final String date = getDate(dJson); - if (StringUtils.isNotBlank(date)) resource.setPublicationDate(date); - final String title = getTitle(dJson); - if (StringUtils.isNotBlank(title)) resource.setTitle(title); - resource.setCreator(getCreators(dJson)); - return resource; - } + public ScholixResource generateDataciteScholixResource(String dJson) { + ScholixResource resource = new ScholixResource(); + String DOI_PATH = rootPath + ".doi"; + final String doi = JsonPath.read(dJson, DOI_PATH); + resource.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); + resource.setObjectType(getType(dJson)); + resource.setDnetIdentifier(generateId(doi, "doi", resource.getObjectType())); + resource.setCollectedFrom(generateDataciteCollectedFrom("complete")); + final String publisher = JsonPath.read(dJson, rootPath + ".publisher"); + if (StringUtils.isNotBlank(publisher)) + resource.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null))); + final String date = getDate(dJson); + if (StringUtils.isNotBlank(date)) + resource.setPublicationDate(date); + final String title = getTitle(dJson); + if (StringUtils.isNotBlank(title)) + resource.setTitle(title); + resource.setCreator(getCreators(dJson)); + return resource; + } - private List getCreators(final String json) { - final List creatorName = JsonPath.read(json, rootPath + ".creators[*].name"); - if (creatorName != null && 
creatorName.size() > 0) { - return creatorName.stream() - .map(s -> new ScholixEntityId(s, null)) - .collect(Collectors.toList()); - } - return null; - } + private List getCreators(final String json) { + final List creatorName = JsonPath.read(json, rootPath + ".creators[*].name"); + if (creatorName != null && creatorName.size() > 0) { + return creatorName + .stream() + .map(s -> new ScholixEntityId(s, null)) + .collect(Collectors.toList()); + } + return null; + } - private String getTitle(final String json) { - final List titles = JsonPath.read(json, rootPath + ".titles[*].title"); - return titles != null && titles.size() > 0 ? titles.get(0) : null; - } + private String getTitle(final String json) { + final List titles = JsonPath.read(json, rootPath + ".titles[*].title"); + return titles != null && titles.size() > 0 ? titles.get(0) : null; + } - private String getDate(final String json) { - final List> dates = JsonPath.read(json, rootPath + ".dates"); - if (dates != null && dates.size() > 0) { + private String getDate(final String json) { + final List> dates = JsonPath.read(json, rootPath + ".dates"); + if (dates != null && dates.size() > 0) { - List> issued = - dates.stream() - .filter(s -> "issued".equalsIgnoreCase(s.get("dateType"))) - .collect(Collectors.toList()); - if (issued.size() > 0) return issued.get(0).get("date"); - } - return null; - } + List> issued = dates + .stream() + .filter(s -> "issued".equalsIgnoreCase(s.get("dateType"))) + .collect(Collectors.toList()); + if (issued.size() > 0) + return issued.get(0).get("date"); + } + return null; + } - private List generateDataciteCollectedFrom(final String completionStatus) { - final ScholixEntityId scholixEntityId = - new ScholixEntityId( - "Datasets in Datacite", - Collections.singletonList( - new ScholixIdentifier("dli_________::datacite", "dnet_identifier"))); - return Collections.singletonList( - new ScholixCollectedFrom(scholixEntityId, "collected", completionStatus)); - } + private List 
generateDataciteCollectedFrom(final String completionStatus) { + final ScholixEntityId scholixEntityId = new ScholixEntityId( + "Datasets in Datacite", + Collections + .singletonList( + new ScholixIdentifier("dli_________::datacite", "dnet_identifier"))); + return Collections + .singletonList( + new ScholixCollectedFrom(scholixEntityId, "collected", completionStatus)); + } - private String getType(final String json) { - try { - final String bibtext = JsonPath.read(json, rootPath + ".types.bibtex"); - if ("article".equalsIgnoreCase(bibtext)) { - return "publication"; - } - return "dataset"; - } catch (Throwable e) { - return "dataset"; - } - } + private String getType(final String json) { + try { + final String bibtext = JsonPath.read(json, rootPath + ".types.bibtex"); + if ("article".equalsIgnoreCase(bibtext)) { + return "publication"; + } + return "dataset"; + } catch (Throwable e) { + return "dataset"; + } + } - private List> getRelatedIendtifiers(final String json) { - String REL_IDENTIFIER_PATH = rootPath + ".relatedIdentifiers[*]"; - List> res = JsonPath.read(json, REL_IDENTIFIER_PATH); - return res; - } + private List> getRelatedIendtifiers(final String json) { + String REL_IDENTIFIER_PATH = rootPath + ".relatedIdentifiers[*]"; + List> res = JsonPath.read(json, REL_IDENTIFIER_PATH); + return res; + } - public static String generateId(final String pid, final String pidType, final String entityType) { - String type; - switch (entityType) { - case "publication": - type = "50|"; - break; - case "dataset": - type = "60|"; - break; - case "unknown": - type = "70|"; - break; - default: - throw new IllegalArgumentException("unexpected value " + entityType); - } - return type - + DHPUtils.md5( - String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); - } + public static String generateId(final String pid, final String pidType, final String entityType) { + String type; + switch (entityType) { + case "publication": + type = "50|"; + break; + 
case "dataset": + type = "60|"; + break; + case "unknown": + type = "70|"; + break; + default: + throw new IllegalArgumentException("unexpected value " + entityType); + } + return type + + DHPUtils + .md5( + String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java index a4e77b37c6..e84ec4376e 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java @@ -1,72 +1,75 @@ + package eu.dnetlib.dhp.provision.update; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; import java.io.IOException; + import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; +import eu.dnetlib.dhp.provision.scholix.ScholixResource; + public class DataciteClient { - private String host; - private String index = "datacite"; - private String indexType = "dump"; - private Datacite2Scholix d2s; + private String host; + private String index = "datacite"; + private String indexType = "dump"; + private Datacite2Scholix d2s; - public DataciteClient(String host) { - this.host = host; + public DataciteClient(String host) { + this.host = host; - d2s = new Datacite2Scholix(null); - d2s.setRootPath("$._source.attributes"); - } + d2s = new Datacite2Scholix(null); + d2s.setRootPath("$._source.attributes"); + } - public Iterable getDatasetsFromTs(final Long timestamp) { - return () -> { - try { - return new DataciteClientIterator(host, index, timestamp); - } 
catch (IOException e) { - throw new RuntimeException(e); - } - }; - } + public Iterable getDatasetsFromTs(final Long timestamp) { + return () -> { + try { + return new DataciteClientIterator(host, index, timestamp); + } catch (IOException e) { + throw new RuntimeException(e); + } + }; + } - public String getHost() { - return host; - } + public String getHost() { + return host; + } - public void setHost(String host) { - this.host = host; - } + public void setHost(String host) { + this.host = host; + } - public String getIndex() { - return index; - } + public String getIndex() { + return index; + } - public void setIndex(String index) { - this.index = index; - } + public void setIndex(String index) { + this.index = index; + } - public String getIndexType() { - return indexType; - } + public String getIndexType() { + return indexType; + } - public void setIndexType(String indexType) { - this.indexType = indexType; - } + public void setIndexType(String indexType) { + this.indexType = indexType; + } - public ScholixResource getDatasetByDOI(final String doi) { - try (CloseableHttpClient client = HttpClients.createDefault()) { - HttpGet httpGet = - new HttpGet( - String.format( - "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); - CloseableHttpResponse response = client.execute(httpGet); - final String json = IOUtils.toString(response.getEntity().getContent()); - return d2s.generateDataciteScholixResource(json); - } catch (Throwable e) { - return null; - } - } + public ScholixResource getDatasetByDOI(final String doi) { + try (CloseableHttpClient client = HttpClients.createDefault()) { + HttpGet httpGet = new HttpGet( + String + .format( + "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); + CloseableHttpResponse response = client.execute(httpGet); + final String json = IOUtils.toString(response.getEntity().getContent()); + return d2s.generateDataciteScholixResource(json); + } catch (Throwable e) { + return 
null; + } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java index fa9dc5646f..2c70c8b091 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java @@ -1,12 +1,11 @@ + package eu.dnetlib.dhp.provision.update; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.JsonPath; import java.io.IOException; import java.util.Iterator; import java.util.List; import java.util.stream.Collectors; -import net.minidev.json.JSONArray; + import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; @@ -14,103 +13,108 @@ import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; + +import net.minidev.json.JSONArray; + public class DataciteClientIterator implements Iterator { - static final String blobPath = "$.hits.hits[*]._source"; - static final String scrollIdPath = "$._scroll_id"; + static final String blobPath = "$.hits.hits[*]._source"; + static final String scrollIdPath = "$._scroll_id"; - String scrollId; + String scrollId; - List buffer; + List buffer; - final String esHost; - final String esIndex; - final ObjectMapper mapper = new ObjectMapper(); + final String esHost; + final String esIndex; + final ObjectMapper mapper = new ObjectMapper(); - public DataciteClientIterator(final String esHost, final String esIndex, long timestamp) - throws IOException { + public 
DataciteClientIterator(final String esHost, final String esIndex, long timestamp) + throws IOException { - this.esHost = esHost; - this.esIndex = esIndex; - // THIS FIX IS NECESSARY to avoid different timezone - timestamp -= (60 * 60 * 2); - final String body = - getResponse( - String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), - String.format( - "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", timestamp)); - scrollId = getJPathString(scrollIdPath, body); - buffer = getBlobs(body); - } + this.esHost = esHost; + this.esIndex = esIndex; + // THIS FIX IS NECESSARY to avoid different timezone + timestamp -= (60 * 60 * 2); + final String body = getResponse( + String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), + String + .format( + "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", timestamp)); + scrollId = getJPathString(scrollIdPath, body); + buffer = getBlobs(body); + } - public String getResponse(final String url, final String json) { - CloseableHttpClient client = HttpClients.createDefault(); - try { + public String getResponse(final String url, final String json) { + CloseableHttpClient client = HttpClients.createDefault(); + try { - HttpPost httpPost = new HttpPost(url); - if (json != null) { - StringEntity entity = new StringEntity(json); - httpPost.setEntity(entity); - httpPost.setHeader("Accept", "application/json"); - httpPost.setHeader("Content-type", "application/json"); - } - CloseableHttpResponse response = client.execute(httpPost); + HttpPost httpPost = new HttpPost(url); + if (json != null) { + StringEntity entity = new StringEntity(json); + httpPost.setEntity(entity); + httpPost.setHeader("Accept", "application/json"); + httpPost.setHeader("Content-type", "application/json"); + } + CloseableHttpResponse response = client.execute(httpPost); - return IOUtils.toString(response.getEntity().getContent()); - } catch (Throwable e) { - throw new RuntimeException("Error on 
executing request ", e); - } finally { - try { - client.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close client ", e); - } - } - } + return IOUtils.toString(response.getEntity().getContent()); + } catch (Throwable e) { + throw new RuntimeException("Error on executing request ", e); + } finally { + try { + client.close(); + } catch (IOException e) { + throw new RuntimeException("Unable to close client ", e); + } + } + } - private String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) return (String) o; - return null; - } catch (Exception e) { - return ""; - } - } + private String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + return null; + } catch (Exception e) { + return ""; + } + } - private List getBlobs(final String body) { - JSONArray array = JsonPath.read(body, blobPath); - return array.stream() - .map( - o -> { - try { - return mapper.writeValueAsString(o); - } catch (Throwable e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toList()); - } + private List getBlobs(final String body) { + JSONArray array = JsonPath.read(body, blobPath); + return array + .stream() + .map( + o -> { + try { + return mapper.writeValueAsString(o); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); + } - @Override - public boolean hasNext() { - return (buffer != null && !buffer.isEmpty()); - } + @Override + public boolean hasNext() { + return (buffer != null && !buffer.isEmpty()); + } - @Override - public String next() { - final String nextItem = buffer.remove(0); - if (buffer.isEmpty()) { - final String json_param = - String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId); - final String body = - getResponse(String.format("http://%s:9200/_search/scroll", esHost), 
json_param); - try { - buffer = getBlobs(body); - } catch (Throwable e) { - System.out.println(body); - } - } - return nextItem; - } + @Override + public String next() { + final String nextItem = buffer.remove(0); + if (buffer.isEmpty()) { + final String json_param = String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId); + final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); + try { + buffer = getBlobs(body); + } catch (Throwable e) { + System.out.println(body); + } + } + return nextItem; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java index 15c396b10a..e876d05a12 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java @@ -1,11 +1,9 @@ + package eu.dnetlib.dhp.provision.update; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.scholexplorer.relation.RelationMapper; import java.net.URI; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -14,54 +12,61 @@ import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.Scholix; +import eu.dnetlib.scholexplorer.relation.RelationMapper; + public class RetrieveUpdateFromDatacite { - public 
static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - RetrieveUpdateFromDatacite.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json"))); - parser.parseArgument(args); - final String hdfsuri = parser.get("namenode"); - Path hdfswritepath = new Path(parser.get("targetPath")); - final long timestamp = Long.parseLong(parser.get("timestamp")); - final String host = parser.get("indexHost"); - final String index = parser.get("indexName"); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + RetrieveUpdateFromDatacite.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json"))); + parser.parseArgument(args); + final String hdfsuri = parser.get("namenode"); + Path hdfswritepath = new Path(parser.get("targetPath")); + final long timestamp = Long.parseLong(parser.get("timestamp")); + final String host = parser.get("indexHost"); + final String index = parser.get("indexName"); - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem.get(URI.create(hdfsuri), conf); - final Datacite2Scholix d2s = new Datacite2Scholix(RelationMapper.load()); - final ObjectMapper mapper = new 
ObjectMapper(); - try (SequenceFile.Writer writer = - SequenceFile.createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final Text value = new Text(); - final IntWritable key = new IntWritable(); - int i = 0; - for (String dataset : new DataciteClient(host).getDatasetsFromTs(timestamp)) { - i++; - List scholix = d2s.generateScholixFromJson(dataset); - if (scholix != null) - for (Scholix s : scholix) { - key.set(i); - value.set(mapper.writeValueAsString(s)); - writer.append(key, value); - if (i % 10000 == 0) { - System.out.println("wrote " + i); - } - } - } - } - } + FileSystem.get(URI.create(hdfsuri), conf); + final Datacite2Scholix d2s = new Datacite2Scholix(RelationMapper.load()); + final ObjectMapper mapper = new ObjectMapper(); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { + final Text value = new Text(); + final IntWritable key = new IntWritable(); + int i = 0; + for (String dataset : new DataciteClient(host).getDatasetsFromTs(timestamp)) { + i++; + List scholix = d2s.generateScholixFromJson(dataset); + if (scholix != null) + for (Scholix s : scholix) { + key.set(i); + value.set(mapper.writeValueAsString(s)); + writer.append(key, value); + if (i % 10000 == 0) { + System.out.println("wrote " + i); + } + } + } + } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java index 09a5c7c3d0..981c471aea 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java +++ 
b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java @@ -1,16 +1,11 @@ + package eu.dnetlib.dhp.provision.update; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; -import eu.dnetlib.dhp.provision.scholix.ScholixRelationship; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; -import eu.dnetlib.dhp.utils.DHPUtils; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.IntWritable; @@ -20,150 +15,170 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.Scholix; +import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; +import eu.dnetlib.dhp.provision.scholix.ScholixRelationship; +import eu.dnetlib.dhp.provision.scholix.ScholixResource; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; public class SparkResolveScholixTarget { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkResolveScholixTarget.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkResolveScholixTarget.class + 
.getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json"))); + parser.parseArgument(args); - final SparkConf conf = new SparkConf(); + final SparkConf conf = new SparkConf(); - final String master = parser.get("master"); - final String sourcePath = parser.get("sourcePath"); - final String workingDirPath = parser.get("workingDirPath"); - final String indexHost = parser.get("indexHost"); - try (SparkSession spark = getSession(conf, master)) { + final String master = parser.get("master"); + final String sourcePath = parser.get("sourcePath"); + final String workingDirPath = parser.get("workingDirPath"); + final String indexHost = parser.get("indexHost"); + try (SparkSession spark = getSession(conf, master)) { - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - spark - .createDataset( - sc.sequenceFile(sourcePath, IntWritable.class, Text.class) - .map(Tuple2::_2) - .map(s -> new ObjectMapper().readValue(s.toString(), Scholix.class)) - .rdd(), - Encoders.bean(Scholix.class)) - .write() - .save(workingDirPath + "/stepA"); + spark + .createDataset( + sc + .sequenceFile(sourcePath, IntWritable.class, Text.class) + .map(Tuple2::_2) + .map(s -> new ObjectMapper().readValue(s.toString(), Scholix.class)) + .rdd(), + Encoders.bean(Scholix.class)) + .write() + .save(workingDirPath + "/stepA"); - Dataset s1 = - spark.read().load(workingDirPath + "/stepA").as(Encoders.bean(Scholix.class)); + Dataset s1 = spark.read().load(workingDirPath + "/stepA").as(Encoders.bean(Scholix.class)); - s1.where(s1.col("target.dnetIdentifier").isNull()) - .select(s1.col("target.identifier")) - .distinct() - .map( - (MapFunction) - f -> { - final String pid = ((Row) f.getList(0).get(0)).getString(0); - ScholixResource publication = - new CrossrefClient(indexHost).getResourceByDOI(pid); - if (publication != null) { - return publication; - } - ScholixResource dataset = new 
DataciteClient(indexHost).getDatasetByDOI(pid); - if (dataset != null) { - return dataset; - } - ScholixResource r = new ScholixResource(); - r.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, "doi"))); - r.setObjectType("unknown"); - r.setDnetIdentifier( - "70|" + DHPUtils.md5(String.format("%s::doi", pid.toLowerCase().trim()))); + s1 + .where(s1.col("target.dnetIdentifier").isNull()) + .select(s1.col("target.identifier")) + .distinct() + .map( + (MapFunction) f -> { + final String pid = ((Row) f.getList(0).get(0)).getString(0); + ScholixResource publication = new CrossrefClient(indexHost).getResourceByDOI(pid); + if (publication != null) { + return publication; + } + ScholixResource dataset = new DataciteClient(indexHost).getDatasetByDOI(pid); + if (dataset != null) { + return dataset; + } + ScholixResource r = new ScholixResource(); + r.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, "doi"))); + r.setObjectType("unknown"); + r + .setDnetIdentifier( + "70|" + DHPUtils.md5(String.format("%s::doi", pid.toLowerCase().trim()))); - return r; - }, - Encoders.bean(ScholixResource.class)) - .write() - .mode(SaveMode.Overwrite) - .save(workingDirPath + "/stepB"); + return r; + }, + Encoders.bean(ScholixResource.class)) + .write() + .mode(SaveMode.Overwrite) + .save(workingDirPath + "/stepB"); - Dataset s2 = - spark.read().load(workingDirPath + "/stepB").as(Encoders.bean(ScholixResource.class)); + Dataset s2 = spark + .read() + .load(workingDirPath + "/stepB") + .as(Encoders.bean(ScholixResource.class)); - s1.joinWith( - s2, - s1.col("target.identifier.identifier").equalTo(s2.col("identifier.identifier")), - "left") - .flatMap( - (FlatMapFunction, Scholix>) - f -> { - final List res = new ArrayList<>(); - final Scholix s = f._1(); - final ScholixResource target = f._2(); - if (StringUtils.isNotBlank(s.getIdentifier())) res.add(s); - else if (target == null) { - ScholixResource currentTarget = s.getTarget(); - 
currentTarget.setObjectType("unknown"); - currentTarget.setDnetIdentifier( - Datacite2Scholix.generateId( - currentTarget.getIdentifier().get(0).getIdentifier(), - currentTarget.getIdentifier().get(0).getSchema(), - currentTarget.getObjectType())); + s1 + .joinWith( + s2, + s1.col("target.identifier.identifier").equalTo(s2.col("identifier.identifier")), + "left") + .flatMap( + (FlatMapFunction, Scholix>) f -> { + final List res = new ArrayList<>(); + final Scholix s = f._1(); + final ScholixResource target = f._2(); + if (StringUtils.isNotBlank(s.getIdentifier())) + res.add(s); + else if (target == null) { + ScholixResource currentTarget = s.getTarget(); + currentTarget.setObjectType("unknown"); + currentTarget + .setDnetIdentifier( + Datacite2Scholix + .generateId( + currentTarget.getIdentifier().get(0).getIdentifier(), + currentTarget.getIdentifier().get(0).getSchema(), + currentTarget.getObjectType())); - s.generateIdentifier(); - res.add(s); - final Scholix inverse = new Scholix(); - inverse.setTarget(s.getSource()); - inverse.setSource(s.getTarget()); - inverse.setLinkprovider(s.getLinkprovider()); - inverse.setPublicationDate(s.getPublicationDate()); - inverse.setPublisher(s.getPublisher()); - inverse.setRelationship( - new ScholixRelationship( - s.getRelationship().getInverse(), - s.getRelationship().getSchema(), - s.getRelationship().getName())); - inverse.generateIdentifier(); - res.add(inverse); + s.generateIdentifier(); + res.add(s); + final Scholix inverse = new Scholix(); + inverse.setTarget(s.getSource()); + inverse.setSource(s.getTarget()); + inverse.setLinkprovider(s.getLinkprovider()); + inverse.setPublicationDate(s.getPublicationDate()); + inverse.setPublisher(s.getPublisher()); + inverse + .setRelationship( + new ScholixRelationship( + s.getRelationship().getInverse(), + s.getRelationship().getSchema(), + s.getRelationship().getName())); + inverse.generateIdentifier(); + res.add(inverse); - } else { - target.setIdentifier( - 
target.getIdentifier().stream() - .map( - d -> - new ScholixIdentifier( - d.getIdentifier().toLowerCase(), - d.getSchema().toLowerCase())) - .collect(Collectors.toList())); - s.setTarget(target); - s.generateIdentifier(); - res.add(s); - final Scholix inverse = new Scholix(); - inverse.setTarget(s.getSource()); - inverse.setSource(s.getTarget()); - inverse.setLinkprovider(s.getLinkprovider()); - inverse.setPublicationDate(s.getPublicationDate()); - inverse.setPublisher(s.getPublisher()); - inverse.setRelationship( - new ScholixRelationship( - s.getRelationship().getInverse(), - s.getRelationship().getSchema(), - s.getRelationship().getName())); - inverse.generateIdentifier(); - res.add(inverse); - } + } else { + target + .setIdentifier( + target + .getIdentifier() + .stream() + .map( + d -> new ScholixIdentifier( + d.getIdentifier().toLowerCase(), + d.getSchema().toLowerCase())) + .collect(Collectors.toList())); + s.setTarget(target); + s.generateIdentifier(); + res.add(s); + final Scholix inverse = new Scholix(); + inverse.setTarget(s.getSource()); + inverse.setSource(s.getTarget()); + inverse.setLinkprovider(s.getLinkprovider()); + inverse.setPublicationDate(s.getPublicationDate()); + inverse.setPublisher(s.getPublisher()); + inverse + .setRelationship( + new ScholixRelationship( + s.getRelationship().getInverse(), + s.getRelationship().getSchema(), + s.getRelationship().getName())); + inverse.generateIdentifier(); + res.add(inverse); + } - return res.iterator(); - }, - Encoders.bean(Scholix.class)) - .javaRDD() - .map(s -> new ObjectMapper().writeValueAsString(s)) - .saveAsTextFile(workingDirPath + "/resolved_json"); - } - } + return res.iterator(); + }, + Encoders.bean(Scholix.class)) + .javaRDD() + .map(s -> new ObjectMapper().writeValueAsString(s)) + .saveAsTextFile(workingDirPath + "/resolved_json"); + } + } - private static SparkSession getSession(SparkConf conf, String master) { - return SparkSession.builder() - .config(conf) - 
.appName(SparkResolveScholixTarget.class.getSimpleName()) - .master(master) - .getOrCreate(); - } + private static SparkSession getSession(SparkConf conf, String master) { + return SparkSession + .builder() + .config(conf) + .appName(SparkResolveScholixTarget.class.getSimpleName()) + .master(master) + .getOrCreate(); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java index 7dba9c95ec..d9cbd22f3f 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java @@ -1,46 +1,50 @@ + package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; -import eu.dnetlib.dhp.provision.update.*; -import eu.dnetlib.scholexplorer.relation.RelationMapper; import java.util.List; + import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.provision.scholix.Scholix; +import eu.dnetlib.dhp.provision.scholix.ScholixResource; +import eu.dnetlib.dhp.provision.update.*; +import eu.dnetlib.scholexplorer.relation.RelationMapper; + public class DataciteClientTest { - @Test - public void dataciteSCholixTest() throws Exception { - final String json = IOUtils.toString(getClass().getResourceAsStream("datacite.json")); - final RelationMapper mapper = RelationMapper.load(); + @Test + public void dataciteSCholixTest() throws Exception { + final String json = IOUtils.toString(getClass().getResourceAsStream("datacite.json")); + final RelationMapper mapper = 
RelationMapper.load(); - Datacite2Scholix ds = new Datacite2Scholix(mapper); - final List s = ds.generateScholixFromJson(json); - System.out.println(new ObjectMapper().writeValueAsString(s)); - } + Datacite2Scholix ds = new Datacite2Scholix(mapper); + final List s = ds.generateScholixFromJson(json); + System.out.println(new ObjectMapper().writeValueAsString(s)); + } - // public void testS() throws Exception { - // RetrieveUpdateFromDatacite.main(new String[]{ - // "-n", "file:///data/new_s2.txt", - // "-t", "/data/new_s2.txt", - // "-ts", "1586974078", - // "-ih", "ip-90-147-167-25.ct1.garrservices.it", - // "-in", "datacite", - // }); - // - // } + // public void testS() throws Exception { + // RetrieveUpdateFromDatacite.main(new String[]{ + // "-n", "file:///data/new_s2.txt", + // "-t", "/data/new_s2.txt", + // "-ts", "1586974078", + // "-ih", "ip-90-147-167-25.ct1.garrservices.it", + // "-in", "datacite", + // }); + // + // } - public void testResolveDataset() throws Exception { - DataciteClient dc = new DataciteClient("ip-90-147-167-25.ct1.garrservices.it"); - ScholixResource datasetByDOI = dc.getDatasetByDOI("10.17182/hepdata.15392.v1/t5"); - Assertions.assertNotNull(datasetByDOI); - System.out.println(new ObjectMapper().writeValueAsString(datasetByDOI)); + public void testResolveDataset() throws Exception { + DataciteClient dc = new DataciteClient("ip-90-147-167-25.ct1.garrservices.it"); + ScholixResource datasetByDOI = dc.getDatasetByDOI("10.17182/hepdata.15392.v1/t5"); + Assertions.assertNotNull(datasetByDOI); + System.out.println(new ObjectMapper().writeValueAsString(datasetByDOI)); - CrossrefClient cr = new CrossrefClient("ip-90-147-167-25.ct1.garrservices.it"); - ScholixResource crossrefByDOI = cr.getResourceByDOI("10.26850/1678-4618eqj.v35.1.2010.p41-46"); - Assertions.assertNotNull(crossrefByDOI); - System.out.println(new ObjectMapper().writeValueAsString(crossrefByDOI)); - } + CrossrefClient cr = new 
CrossrefClient("ip-90-147-167-25.ct1.garrservices.it"); + ScholixResource crossrefByDOI = cr.getResourceByDOI("10.26850/1678-4618eqj.v35.1.2010.p41-46"); + Assertions.assertNotNull(crossrefByDOI); + System.out.println(new ObjectMapper().writeValueAsString(crossrefByDOI)); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java index cc6e999ae7..be97072b57 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java @@ -1,27 +1,30 @@ + package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.provision.scholix.Scholix; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; + public class ExtractInfoTest { - @Test - public void testSerialization() throws Exception { + @Test + public void testSerialization() throws Exception { - ScholixSummary summary = new ScholixSummary(); - summary.setDescription("descrizione"); - ObjectMapper mapper = new ObjectMapper(); - String json = mapper.writeValueAsString(summary); - System.out.println(json); - System.out.println(mapper.readValue(json, ScholixSummary.class).getDescription()); - } + ScholixSummary summary = new ScholixSummary(); + summary.setDescription("descrizione"); + ObjectMapper mapper = new ObjectMapper(); + String json = mapper.writeValueAsString(summary); + System.out.println(json); + System.out.println(mapper.readValue(json, ScholixSummary.class).getDescription()); + } 
- @Test - public void testScholix() throws Exception { - final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json")); - final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json")); - Scholix.generateScholixWithSource(jsonSummary, jsonRelation); - } + @Test + public void testScholix() throws Exception { + final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json")); + final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json")); + Scholix.generateScholixWithSource(jsonSummary, jsonRelation); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java index 07b11010b5..99247b7562 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java @@ -1,16 +1,12 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; -import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; -import eu.dnetlib.dhp.oa.provision.model.Tuple2; -import eu.dnetlib.dhp.schema.common.ModelSupport; import java.util.ArrayList; import java.util.List; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -21,103 +17,108 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import 
eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; +import eu.dnetlib.dhp.oa.provision.model.Tuple2; +import eu.dnetlib.dhp.schema.common.ModelSupport; + /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *

The operation is implemented by sequentially joining one entity type at time (E) with the - * relationships (R), and again by E, finally grouped by E.id; - * - *

The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *

4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *

+ * The operation is implemented by sequentially joining one entity type at a time (E) with the relationships (R), and + * again by E, finally grouped by E.id; + *

+ * The workflow is organized in different parts aimed to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *

+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *

+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *

+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class AdjacencyListBuilderJob { - private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class); + private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class); - public static final int MAX_LINKS = 100; + public static final int MAX_LINKS = 100; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - AdjacencyListBuilderJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + AdjacencyListBuilderJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + 
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - createAdjacencyLists(spark, inputPath, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + createAdjacencyLists(spark, inputPath, outputPath); + }); + } - private static void createAdjacencyLists( - SparkSession spark, String inputPath, String outputPath) { + private static void createAdjacencyLists( + SparkSession spark, String inputPath, String outputPath) { - log.info("Reading joined entities from: {}", inputPath); - spark - .read() - .load(inputPath) - .as(Encoders.bean(EntityRelEntity.class)) - .groupByKey( - (MapFunction) value -> value.getEntity().getId(), - Encoders.STRING()) - .mapGroups( - (MapGroupsFunction) - (key, values) -> { - JoinedEntity j = new JoinedEntity(); - List links = new ArrayList<>(); - while (values.hasNext() && links.size() < MAX_LINKS) { - EntityRelEntity curr = values.next(); - if (j.getEntity() == null) { - j.setEntity(curr.getEntity()); - } - links.add(new Tuple2(curr.getRelation(), curr.getTarget())); - } - j.setLinks(links); - return j; - }, - Encoders.bean(JoinedEntity.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + log.info("Reading joined entities from: {}", inputPath); + spark + .read() + .load(inputPath) + .as(Encoders.bean(EntityRelEntity.class)) + .groupByKey( + (MapFunction) value -> value.getEntity().getId(), + Encoders.STRING()) + .mapGroups( + (MapGroupsFunction) (key, values) -> { + JoinedEntity j = new JoinedEntity(); + List links = new ArrayList<>(); + while (values.hasNext() && links.size() < MAX_LINKS) { + EntityRelEntity curr = values.next(); + if (j.getEntity() == null) { + j.setEntity(curr.getEntity()); + } + links.add(new Tuple2(curr.getRelation(), 
curr.getTarget())); + } + j.setLinks(links); + return j; + }, + Encoders.bean(JoinedEntity.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index a9c97155c5..606fa4cc0c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -1,21 +1,14 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; -import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; -import eu.dnetlib.dhp.oa.provision.model.SortableRelation; -import eu.dnetlib.dhp.schema.common.EntityType; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -25,224 +18,228 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; 
import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; +import eu.dnetlib.dhp.schema.common.EntityType; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *

The operation is implemented by sequentially joining one entity type at time (E) with the - * relationships (R), and again by E, finally grouped by E.id; - * - *

The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *

4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *

+ * The operation is implemented by sequentially joining one entity type at a time (E) with the relationships (R), and + * again by E, finally grouped by E.id; + *

+ * The workflow is organized in different parts aimed to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *

+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *

+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *

+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class CreateRelatedEntitiesJob_phase1 { - private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase1.class); + private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase1.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareRelationsJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + String jsonConfiguration = IOUtils + .toString( + PrepareRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputRelationsPath = parser.get("inputRelationsPath"); - log.info("inputRelationsPath: {}", inputRelationsPath); + String inputRelationsPath = parser.get("inputRelationsPath"); + log.info("inputRelationsPath: {}", inputRelationsPath); - String inputEntityPath = parser.get("inputEntityPath"); - log.info("inputEntityPath: {}", inputEntityPath); + String inputEntityPath = 
parser.get("inputEntityPath"); + log.info("inputEntityPath: {}", inputEntityPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - String graphTableClassName = parser.get("graphTableClassName"); - log.info("graphTableClassName: {}", graphTableClassName); + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); - Class entityClazz = - (Class) Class.forName(graphTableClassName); + Class entityClazz = (Class) Class.forName(graphTableClassName); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath); + }); + } - private static void joinRelationEntity( - SparkSession spark, - String inputRelationsPath, - String inputEntityPath, - Class clazz, - String outputPath) { + private static void joinRelationEntity( + SparkSession spark, + String inputRelationsPath, + String inputEntityPath, + Class clazz, + String outputPath) { - Dataset> relsByTarget = - readPathRelation(spark, inputRelationsPath) - .filter("dataInfo.deletedbyinference == false") - .map( - (MapFunction>) - r -> new Tuple2<>(r.getTarget(), r), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class))) - 
.cache(); + Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) + .filter("dataInfo.deletedbyinference == false") + .map( + (MapFunction>) r -> new Tuple2<>(r.getTarget(), r), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class))) + .cache(); - Dataset> entities = - readPathEntity(spark, inputEntityPath, clazz) - .filter("dataInfo.invisible == false") - .map( - (MapFunction) value -> asRelatedEntity(value, clazz), - Encoders.bean(RelatedEntity.class)) - .map( - (MapFunction>) - e -> new Tuple2<>(e.getId(), e), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) - .cache(); + Dataset> entities = readPathEntity(spark, inputEntityPath, clazz) + .filter("dataInfo.invisible == false") + .map( + (MapFunction) value -> asRelatedEntity(value, clazz), + Encoders.bean(RelatedEntity.class)) + .map( + (MapFunction>) e -> new Tuple2<>(e.getId(), e), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) + .cache(); - relsByTarget - .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") - .map( - (MapFunction< - Tuple2, Tuple2>, - EntityRelEntity>) - t -> new EntityRelEntity(t._1()._2(), t._2()._2()), - Encoders.bean(EntityRelEntity.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath + "/" + EntityType.fromClass(clazz)); - } + relsByTarget + .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") + .map( + (MapFunction, Tuple2>, EntityRelEntity>) t -> new EntityRelEntity( + t._1()._2(), t._2()._2()), + Encoders.bean(EntityRelEntity.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath + "/" + EntityType.fromClass(clazz)); + } - private static Dataset readPathEntity( - SparkSession spark, String inputEntityPath, Class entityClazz) { + private static Dataset readPathEntity( + SparkSession spark, String inputEntityPath, Class entityClazz) { - log.info("Reading Graph table from: {}", inputEntityPath); - return spark - .read() 
- .textFile(inputEntityPath) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), - Encoders.bean(entityClazz)); - } + log.info("Reading Graph table from: {}", inputEntityPath); + return spark + .read() + .textFile(inputEntityPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), + Encoders.bean(entityClazz)); + } - public static RelatedEntity asRelatedEntity(E entity, Class clazz) { + public static RelatedEntity asRelatedEntity(E entity, Class clazz) { - final RelatedEntity re = new RelatedEntity(); - re.setId(entity.getId()); - re.setType(EntityType.fromClass(clazz).name()); + final RelatedEntity re = new RelatedEntity(); + re.setId(entity.getId()); + re.setType(EntityType.fromClass(clazz).name()); - re.setPid(entity.getPid()); - re.setCollectedfrom(entity.getCollectedfrom()); + re.setPid(entity.getPid()); + re.setCollectedfrom(entity.getCollectedfrom()); - switch (EntityType.fromClass(clazz)) { - case publication: - case dataset: - case otherresearchproduct: - case software: - Result result = (Result) entity; + switch (EntityType.fromClass(clazz)) { + case publication: + case dataset: + case otherresearchproduct: + case software: + Result result = (Result) entity; - if (result.getTitle() != null && !result.getTitle().isEmpty()) { - re.setTitle(result.getTitle().stream().findFirst().get()); - } + if (result.getTitle() != null && !result.getTitle().isEmpty()) { + re.setTitle(result.getTitle().stream().findFirst().get()); + } - re.setDateofacceptance(getValue(result.getDateofacceptance())); - re.setPublisher(getValue(result.getPublisher())); - re.setResulttype(result.getResulttype()); - re.setInstances(result.getInstance()); + re.setDateofacceptance(getValue(result.getDateofacceptance())); + re.setPublisher(getValue(result.getPublisher())); + re.setResulttype(result.getResulttype()); + re.setInstances(result.getInstance()); - // TODO still to be mapped - // re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); 
+ // TODO still to be mapped + // re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); - break; - case datasource: - Datasource d = (Datasource) entity; + break; + case datasource: + Datasource d = (Datasource) entity; - re.setOfficialname(getValue(d.getOfficialname())); - re.setWebsiteurl(getValue(d.getWebsiteurl())); - re.setDatasourcetype(d.getDatasourcetype()); - re.setOpenairecompatibility(d.getOpenairecompatibility()); + re.setOfficialname(getValue(d.getOfficialname())); + re.setWebsiteurl(getValue(d.getWebsiteurl())); + re.setDatasourcetype(d.getDatasourcetype()); + re.setOpenairecompatibility(d.getOpenairecompatibility()); - break; - case organization: - Organization o = (Organization) entity; + break; + case organization: + Organization o = (Organization) entity; - re.setLegalname(getValue(o.getLegalname())); - re.setLegalshortname(getValue(o.getLegalshortname())); - re.setCountry(o.getCountry()); - re.setWebsiteurl(getValue(o.getWebsiteurl())); - break; - case project: - Project p = (Project) entity; + re.setLegalname(getValue(o.getLegalname())); + re.setLegalshortname(getValue(o.getLegalshortname())); + re.setCountry(o.getCountry()); + re.setWebsiteurl(getValue(o.getWebsiteurl())); + break; + case project: + Project p = (Project) entity; - re.setProjectTitle(getValue(p.getTitle())); - re.setCode(getValue(p.getCode())); - re.setAcronym(getValue(p.getAcronym())); - re.setContracttype(p.getContracttype()); + re.setProjectTitle(getValue(p.getTitle())); + re.setCode(getValue(p.getCode())); + re.setAcronym(getValue(p.getAcronym())); + re.setContracttype(p.getContracttype()); - List> f = p.getFundingtree(); - if (!f.isEmpty()) { - re.setFundingtree(f.stream().map(s -> s.getValue()).collect(Collectors.toList())); - } - break; - } - return re; - } + List> f = p.getFundingtree(); + if (!f.isEmpty()) { + re.setFundingtree(f.stream().map(s -> s.getValue()).collect(Collectors.toList())); + } + break; + } + return re; + } - private static String getValue(Field 
field) { - return getFieldValueWithDefault(field, ""); - } + private static String getValue(Field field) { + return getFieldValueWithDefault(field, ""); + } - private static T getFieldValueWithDefault(Field f, T defaultValue) { - return Optional.ofNullable(f) - .filter(Objects::nonNull) - .map(x -> x.getValue()) - .orElse(defaultValue); - } + private static T getFieldValueWithDefault(Field f, T defaultValue) { + return Optional + .ofNullable(f) + .filter(Objects::nonNull) + .map(x -> x.getValue()) + .orElse(defaultValue); + } - /** - * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline - * delimited json text file, - * - * @param spark - * @param relationPath - * @return the Dataset containing all the relationships - */ - private static Dataset readPathRelation( - SparkSession spark, final String relationPath) { + /** + * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text + * file, + * + * @param spark + * @param relationPath + * @return the Dataset containing all the relationships + */ + private static Dataset readPathRelation( + SparkSession spark, final String relationPath) { - log.info("Reading relations from: {}", relationPath); - return spark.read().load(relationPath).as(Encoders.bean(SortableRelation.class)); - } + log.info("Reading relations from: {}", relationPath); + return spark.read().load(relationPath).as(Encoders.bean(SortableRelation.class)); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 021ef86ba5..403817019e 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -1,17 +1,11 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; -import eu.dnetlib.dhp.oa.provision.model.TypedRow; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.List; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -23,203 +17,200 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.TypedRow; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; import scala.collection.JavaConverters; import scala.collection.Seq; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. 
The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *

The operation is implemented by sequentially joining one entity type at time (E) with the - * relationships (R), and again by E, finally grouped by E.id; - * - *

The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *

4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *

+ * The operation is implemented by sequentially joining one entity type at a time (E) with the relationships (R), and + * again by E, finally grouped by E.id; + *

+ * The workflow is organized in different parts aimed to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *

+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *

+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *

+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class CreateRelatedEntitiesJob_phase2 { - private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase2.class); + private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase2.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareRelationsJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + String jsonConfiguration = IOUtils + .toString( + PrepareRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputRelatedEntitiesPath = parser.get("inputRelatedEntitiesPath"); - log.info("inputRelatedEntitiesPath: {}", inputRelatedEntitiesPath); + String inputRelatedEntitiesPath = parser.get("inputRelatedEntitiesPath"); + log.info("inputRelatedEntitiesPath: {}", inputRelatedEntitiesPath); - String inputGraphRootPath = parser.get("inputGraphRootPath"); - log.info("inputGraphRootPath: {}", 
inputGraphRootPath); + String inputGraphRootPath = parser.get("inputGraphRootPath"); + log.info("inputGraphRootPath: {}", inputGraphRootPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - int numPartitions = Integer.parseInt(parser.get("numPartitions")); - log.info("numPartitions: {}", numPartitions); + int numPartitions = Integer.parseInt(parser.get("numPartitions")); + log.info("numPartitions: {}", numPartitions); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - joinAllEntities( - spark, inputRelatedEntitiesPath, inputGraphRootPath, outputPath, numPartitions); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + joinAllEntities( + spark, inputRelatedEntitiesPath, inputGraphRootPath, outputPath, numPartitions); + }); + } - private static void joinAllEntities( - SparkSession spark, - String inputRelatedEntitiesPath, - String inputGraphRootPath, - String outputPath, - int numPartitions) { + private static void joinAllEntities( + SparkSession spark, + String inputRelatedEntitiesPath, + String inputGraphRootPath, + String outputPath, + int numPartitions) { - Dataset> entities = - readAllEntities(spark, inputGraphRootPath, numPartitions); - Dataset> relsBySource = - readRelatedEntities(spark, inputRelatedEntitiesPath); + Dataset> entities = readAllEntities(spark, inputGraphRootPath, numPartitions); + Dataset> relsBySource = readRelatedEntities(spark, 
inputRelatedEntitiesPath); - entities - .joinWith(relsBySource, entities.col("_1").equalTo(relsBySource.col("_1")), "left_outer") - .map( - (MapFunction< - Tuple2, Tuple2>, - EntityRelEntity>) - value -> { - EntityRelEntity re = new EntityRelEntity(); - re.setEntity(value._1()._2()); - Optional related = - Optional.ofNullable(value._2()).map(Tuple2::_2); - if (related.isPresent()) { - re.setRelation(related.get().getRelation()); - re.setTarget(related.get().getTarget()); - } - return re; - }, - Encoders.bean(EntityRelEntity.class)) - .repartition(numPartitions) - .filter( - (FilterFunction) - value -> - value.getEntity() != null && StringUtils.isNotBlank(value.getEntity().getId())) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + entities + .joinWith(relsBySource, entities.col("_1").equalTo(relsBySource.col("_1")), "left_outer") + .map( + (MapFunction, Tuple2>, EntityRelEntity>) value -> { + EntityRelEntity re = new EntityRelEntity(); + re.setEntity(value._1()._2()); + Optional related = Optional.ofNullable(value._2()).map(Tuple2::_2); + if (related.isPresent()) { + re.setRelation(related.get().getRelation()); + re.setTarget(related.get().getTarget()); + } + return re; + }, + Encoders.bean(EntityRelEntity.class)) + .repartition(numPartitions) + .filter( + (FilterFunction) value -> value.getEntity() != null + && StringUtils.isNotBlank(value.getEntity().getId())) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - private static Dataset> readAllEntities( - SparkSession spark, String inputGraphPath, int numPartitions) { - Dataset publication = - readPathEntity(spark, inputGraphPath + "/publication", Publication.class); - Dataset dataset = - readPathEntity(spark, inputGraphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); - Dataset other = - readPathEntity(spark, inputGraphPath + "/otherresearchproduct", OtherResearchProduct.class); - Dataset software = - readPathEntity(spark, inputGraphPath + "/software", 
Software.class); - Dataset datasource = - readPathEntity(spark, inputGraphPath + "/datasource", Datasource.class); - Dataset organization = - readPathEntity(spark, inputGraphPath + "/organization", Organization.class); - Dataset project = readPathEntity(spark, inputGraphPath + "/project", Project.class); + private static Dataset> readAllEntities( + SparkSession spark, String inputGraphPath, int numPartitions) { + Dataset publication = readPathEntity(spark, inputGraphPath + "/publication", Publication.class); + Dataset dataset = readPathEntity( + spark, inputGraphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + Dataset other = readPathEntity( + spark, inputGraphPath + "/otherresearchproduct", OtherResearchProduct.class); + Dataset software = readPathEntity(spark, inputGraphPath + "/software", Software.class); + Dataset datasource = readPathEntity(spark, inputGraphPath + "/datasource", Datasource.class); + Dataset organization = readPathEntity(spark, inputGraphPath + "/organization", Organization.class); + Dataset project = readPathEntity(spark, inputGraphPath + "/project", Project.class); - return publication - .union(dataset) - .union(other) - .union(software) - .union(datasource) - .union(organization) - .union(project) - .map( - (MapFunction>) - value -> new Tuple2<>(value.getId(), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) - .repartition(numPartitions); - } + return publication + .union(dataset) + .union(other) + .union(software) + .union(datasource) + .union(organization) + .union(project) + .map( + (MapFunction>) value -> new Tuple2<>(value.getId(), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) + .repartition(numPartitions); + } - private static Dataset> readRelatedEntities( - SparkSession spark, String inputRelatedEntitiesPath) { + private static Dataset> readRelatedEntities( + SparkSession spark, String inputRelatedEntitiesPath) { - log.info("Reading related entities from: {}", 
inputRelatedEntitiesPath); + log.info("Reading related entities from: {}", inputRelatedEntitiesPath); - final List paths = - HdfsSupport.listFiles(inputRelatedEntitiesPath, spark.sparkContext().hadoopConfiguration()); + final List paths = HdfsSupport + .listFiles(inputRelatedEntitiesPath, spark.sparkContext().hadoopConfiguration()); - log.info("Found paths: {}", String.join(",", paths)); + log.info("Found paths: {}", String.join(",", paths)); - return spark - .read() - .load(toSeq(paths)) - .as(Encoders.bean(EntityRelEntity.class)) - .map( - (MapFunction>) - value -> new Tuple2<>(value.getRelation().getSource(), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); - } + return spark + .read() + .load(toSeq(paths)) + .as(Encoders.bean(EntityRelEntity.class)) + .map( + (MapFunction>) value -> new Tuple2<>( + value.getRelation().getSource(), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); + } - private static Dataset readPathEntity( - SparkSession spark, String inputEntityPath, Class entityClazz) { + private static Dataset readPathEntity( + SparkSession spark, String inputEntityPath, Class entityClazz) { - log.info("Reading Graph table from: {}", inputEntityPath); - return spark - .read() - .textFile(inputEntityPath) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), - Encoders.bean(entityClazz)) - .filter("dataInfo.invisible == false") - .map( - (MapFunction) - value -> getTypedRow(StringUtils.substringAfterLast(inputEntityPath, "/"), value), - Encoders.bean(TypedRow.class)); - } + log.info("Reading Graph table from: {}", inputEntityPath); + return spark + .read() + .textFile(inputEntityPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), + Encoders.bean(entityClazz)) + .filter("dataInfo.invisible == false") + .map( + (MapFunction) value -> getTypedRow( + StringUtils.substringAfterLast(inputEntityPath, "/"), value), + 
Encoders.bean(TypedRow.class)); + } - private static TypedRow getTypedRow(String type, OafEntity entity) - throws JsonProcessingException { - TypedRow t = new TypedRow(); - t.setType(type); - t.setDeleted(entity.getDataInfo().getDeletedbyinference()); - t.setId(entity.getId()); - t.setOaf(OBJECT_MAPPER.writeValueAsString(entity)); - return t; - } + private static TypedRow getTypedRow(String type, OafEntity entity) + throws JsonProcessingException { + TypedRow t = new TypedRow(); + t.setType(type); + t.setDeleted(entity.getDataInfo().getDeletedbyinference()); + t.setId(entity.getId()); + t.setOaf(OBJECT_MAPPER.writeValueAsString(entity)); + return t; + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - private static Seq toSeq(List list) { - return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); - } + private static Seq toSeq(List list) { + return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 72eb15cbb6..dbdc54fc04 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,15 +1,10 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; -import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.SortableRelation; -import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -22,139 +17,144 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Iterables; +import com.google.common.collect.Iterators; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; +import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; import scala.Tuple2; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *

The operation is implemented by sequentially joining one entity type at time (E) with the - * relationships (R), and again by E, finally grouped by E.id; - * - *

The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *

4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *

+ * The operation is implemented by sequentially joining one entity type at a time (E) with the relationships (R), and + * again by E, finally grouped by E.id; + *

+ * The workflow is organized in different parts aimed to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *

+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *

+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *

+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class PrepareRelationsJob { - private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class); + private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final int MAX_RELS = 100; + public static final int MAX_RELS = 100; - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareRelationsJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputRelationsPath = parser.get("inputRelationsPath"); - log.info("inputRelationsPath: {}", inputRelationsPath); + String inputRelationsPath = parser.get("inputRelationsPath"); + log.info("inputRelationsPath: {}", inputRelationsPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = 
parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - prepareRelationsFromPaths(spark, inputRelationsPath, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + prepareRelationsFromPaths(spark, inputRelationsPath, outputPath); + }); + } - private static void prepareRelationsFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath) { - readPathRelation(spark, inputRelationsPath) - .filter("dataInfo.deletedbyinference == false") - .groupByKey( - (MapFunction) value -> value.getSource(), Encoders.STRING()) - .flatMapGroups( - (FlatMapGroupsFunction) - (key, values) -> Iterators.limit(values, MAX_RELS), - Encoders.bean(SortableRelation.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + private static void prepareRelationsFromPaths( + SparkSession spark, String inputRelationsPath, String outputPath) { + readPathRelation(spark, inputRelationsPath) + .filter("dataInfo.deletedbyinference == false") + .groupByKey( + (MapFunction) value -> value.getSource(), Encoders.STRING()) + .flatMapGroups( + (FlatMapGroupsFunction) (key, values) -> Iterators + .limit(values, MAX_RELS), + Encoders.bean(SortableRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - /** - * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline - * delimited json text file, - * - * @param spark - * @param inputPath - * @return the Dataset containing all the relationships - */ - private static Dataset readPathRelation( - SparkSession spark, final String inputPath) { - return spark - .read() - .textFile(inputPath) - .map( - (MapFunction) - value -> OBJECT_MAPPER.readValue(value, SortableRelation.class), - 
Encoders.bean(SortableRelation.class)); - } + /** + * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text + * file, + * + * @param spark + * @param inputPath + * @return the Dataset containing all the relationships + */ + private static Dataset readPathRelation( + SparkSession spark, final String inputPath) { + return spark + .read() + .textFile(inputPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, SortableRelation.class), + Encoders.bean(SortableRelation.class)); + } - // TODO work in progress - private static void prepareRelationsRDDFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { - JavaRDD rels = - readPathRelationRDD(spark, inputRelationsPath).repartition(numPartitions); + // TODO work in progress + private static void prepareRelationsRDDFromPaths( + SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { + JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath).repartition(numPartitions); - RDD d = - rels.filter(rel -> !rel.getDataInfo().getDeletedbyinference()) // only - // consider - // those - // that are not virtually - // deleted - .mapToPair( - (PairFunction) - rel -> new Tuple2<>(rel, rel)) - .groupByKey(new RelationPartitioner(rels.getNumPartitions())) - .map(p -> Iterables.limit(p._2(), MAX_RELS)) - .flatMap(p -> p.iterator()) - .rdd(); + RDD d = rels + .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) // only + // consider + // those + // that are not virtually + // deleted + .mapToPair( + (PairFunction) rel -> new Tuple2<>(rel, rel)) + .groupByKey(new RelationPartitioner(rels.getNumPartitions())) + .map(p -> Iterables.limit(p._2(), MAX_RELS)) + .flatMap(p -> p.iterator()) + .rdd(); - spark - .createDataset(d, Encoders.bean(SortableRelation.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + spark + .createDataset(d, 
Encoders.bean(SortableRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - private static JavaRDD readPathRelationRDD( - SparkSession spark, final String inputPath) { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class)); - } + private static JavaRDD readPathRelationRDD( + SparkSession spark, final String inputPath) { + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class)); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java index aabeae5eeb..a88b28592e 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java @@ -1,19 +1,13 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Maps; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.*; -import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; -import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.ArrayList; import java.util.Map; import 
java.util.Optional; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; @@ -27,178 +21,205 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; +import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *

The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *

4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *

+ * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *

+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *

+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *

+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class XmlConverterJob { - private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); + private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; + public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - XmlConverterJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + XmlConverterJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - String isLookupUrl = 
parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); + String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); - String otherDsTypeId = parser.get("otherDsTypeId"); - log.info("otherDsTypeId: {}", otherDsTypeId); + String otherDsTypeId = parser.get("otherDsTypeId"); + log.info("otherDsTypeId: {}", otherDsTypeId); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - convertToXml( - spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl), otherDsTypeId); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + convertToXml( + spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl), otherDsTypeId); + }); + } - private static void convertToXml( - SparkSession spark, - String inputPath, - String outputPath, - ContextMapper contextMapper, - String otherDsTypeId) { + private static void convertToXml( + SparkSession spark, + String inputPath, + String outputPath, + ContextMapper contextMapper, + String otherDsTypeId) { - final XmlRecordFactory recordFactory = - new XmlRecordFactory( - prepareAccumulators(spark.sparkContext()), - contextMapper, - false, - schemaLocation, - otherDsTypeId); + final XmlRecordFactory recordFactory = new XmlRecordFactory( + prepareAccumulators(spark.sparkContext()), + contextMapper, + false, + schemaLocation, + otherDsTypeId); - spark - .read() - .load(inputPath) - .as(Encoders.bean(JoinedEntity.class)) - .map( - (MapFunction) - j -> { - if (j.getLinks() != null) { - j.setLinks( - j.getLinks().stream() - .filter(t -> t.getRelation() != null & t.getRelatedEntity() != null) - .collect(Collectors.toCollection(ArrayList::new))); - } - return j; - }, - Encoders.bean(JoinedEntity.class)) - .map( - (MapFunction>) - je -> new Tuple2<>(je.getEntity().getId(), recordFactory.build(je)), - 
Encoders.tuple(Encoders.STRING(), Encoders.STRING())) - .javaRDD() - .mapToPair( - (PairFunction, Text, Text>) - t -> new Tuple2<>(new Text(t._1()), new Text(t._2()))) - .saveAsHadoopFile( - outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); - } + spark + .read() + .load(inputPath) + .as(Encoders.bean(JoinedEntity.class)) + .map( + (MapFunction) j -> { + if (j.getLinks() != null) { + j + .setLinks( + j + .getLinks() + .stream() + .filter(t -> t.getRelation() != null & t.getRelatedEntity() != null) + .collect(Collectors.toCollection(ArrayList::new))); + } + return j; + }, + Encoders.bean(JoinedEntity.class)) + .map( + (MapFunction>) je -> new Tuple2<>(je.getEntity().getId(), + recordFactory.build(je)), + Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .javaRDD() + .mapToPair( + (PairFunction, Text, Text>) t -> new Tuple2<>(new Text(t._1()), + new Text(t._2()))) + .saveAsHadoopFile( + outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - private static Map prepareAccumulators(SparkContext sc) { - Map accumulators = Maps.newHashMap(); - accumulators.put( - "resultResult_similarity_isAmongTopNSimilarDocuments", - sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); - accumulators.put( - "resultResult_similarity_hasAmongTopNSimilarDocuments", - sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); - accumulators.put( - "resultResult_supplement_isSupplementTo", - sc.longAccumulator("resultResult_supplement_isSupplementTo")); - accumulators.put( - "resultResult_supplement_isSupplementedBy", - 
sc.longAccumulator("resultResult_supplement_isSupplementedBy")); - accumulators.put( - "resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); - accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); + private static Map prepareAccumulators(SparkContext sc) { + Map accumulators = Maps.newHashMap(); + accumulators + .put( + "resultResult_similarity_isAmongTopNSimilarDocuments", + sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); + accumulators + .put( + "resultResult_similarity_hasAmongTopNSimilarDocuments", + sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); + accumulators + .put( + "resultResult_supplement_isSupplementTo", + sc.longAccumulator("resultResult_supplement_isSupplementTo")); + accumulators + .put( + "resultResult_supplement_isSupplementedBy", + sc.longAccumulator("resultResult_supplement_isSupplementedBy")); + accumulators + .put( + "resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); + accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); - accumulators.put( - "resultResult_publicationDataset_isRelatedTo", - sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); - accumulators.put( - "resultResult_relationship_isRelatedTo", - sc.longAccumulator("resultResult_relationship_isRelatedTo")); - accumulators.put( - "resultProject_outcome_isProducedBy", - sc.longAccumulator("resultProject_outcome_isProducedBy")); - accumulators.put( - "resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put( - "resultOrganization_affiliation_isAuthorInstitutionOf", - sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); + accumulators + .put( + "resultResult_publicationDataset_isRelatedTo", + sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); + accumulators + .put( + 
"resultResult_relationship_isRelatedTo", + sc.longAccumulator("resultResult_relationship_isRelatedTo")); + accumulators + .put( + "resultProject_outcome_isProducedBy", + sc.longAccumulator("resultProject_outcome_isProducedBy")); + accumulators + .put( + "resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); + accumulators + .put( + "resultOrganization_affiliation_isAuthorInstitutionOf", + sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); - accumulators.put( - "resultOrganization_affiliation_hasAuthorInstitution", - sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); - accumulators.put( - "projectOrganization_participation_hasParticipant", - sc.longAccumulator("projectOrganization_participation_hasParticipant")); - accumulators.put( - "projectOrganization_participation_isParticipant", - sc.longAccumulator("projectOrganization_participation_isParticipant")); - accumulators.put( - "organizationOrganization_dedup_isMergedIn", - sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); - accumulators.put( - "organizationOrganization_dedup_merges", - sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put( - "datasourceOrganization_provision_isProvidedBy", - sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); - accumulators.put( - "datasourceOrganization_provision_provides", - sc.longAccumulator("datasourceOrganization_provision_provides")); + accumulators + .put( + "resultOrganization_affiliation_hasAuthorInstitution", + sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); + accumulators + .put( + "projectOrganization_participation_hasParticipant", + sc.longAccumulator("projectOrganization_participation_hasParticipant")); + accumulators + .put( + "projectOrganization_participation_isParticipant", + sc.longAccumulator("projectOrganization_participation_isParticipant")); + accumulators + .put( + 
"organizationOrganization_dedup_isMergedIn", + sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); + accumulators + .put( + "organizationOrganization_dedup_merges", + sc.longAccumulator("resultProject_outcome_produces")); + accumulators + .put( + "datasourceOrganization_provision_isProvidedBy", + sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); + accumulators + .put( + "datasourceOrganization_provision_provides", + sc.longAccumulator("datasourceOrganization_provision_provides")); - return accumulators; - } + return accumulators; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java index ca81e0b3f0..b9746f153b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java @@ -1,25 +1,20 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.lucidworks.spark.util.SolrSupport; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Optional; + import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.stream.StreamResult; import 
javax.xml.transform.stream.StreamSource; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; @@ -30,197 +25,206 @@ import org.apache.spark.rdd.RDD; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.lucidworks.spark.util.SolrSupport; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + public class XmlIndexingJob { - private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class); + private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class); - private static final Integer DEFAULT_BATCH_SIZE = 1000; + private static final Integer DEFAULT_BATCH_SIZE = 1000; - private static final String LAYOUT = "index"; - private static final String INTERPRETATION = "openaire"; - private static final String SEPARATOR = "-"; - public static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'"; + private static final String LAYOUT = "index"; + private static final String INTERPRETATION = "openaire"; + private static final String SEPARATOR = "-"; + public static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'"; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - XmlIndexingJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_update_index.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + 
XmlIndexingJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_update_index.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); - final String format = parser.get("format"); - log.info("format: {}", format); + final String format = parser.get("format"); + log.info("format: {}", format); - final Integer batchSize = - parser.getObjectMap().containsKey("batchSize") - ? Integer.valueOf(parser.get("batchSize")) - : DEFAULT_BATCH_SIZE; - log.info("batchSize: {}", batchSize); + final Integer batchSize = parser.getObjectMap().containsKey("batchSize") + ? 
Integer.valueOf(parser.get("batchSize")) + : DEFAULT_BATCH_SIZE; + log.info("batchSize: {}", batchSize); - final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); - final String fields = getLayoutSource(isLookup, format); - log.info("fields: {}", fields); + final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); + final String fields = getLayoutSource(isLookup, format); + log.info("fields: {}", fields); - final String xslt = getLayoutTransformer(isLookup); + final String xslt = getLayoutTransformer(isLookup); - final String dsId = getDsId(format, isLookup); - log.info("dsId: {}", dsId); + final String dsId = getDsId(format, isLookup); + log.info("dsId: {}", dsId); - final String zkHost = getZkHost(isLookup); - log.info("zkHost: {}", zkHost); + final String zkHost = getZkHost(isLookup); + log.info("zkHost: {}", zkHost); - final String version = getRecordDatestamp(); + final String version = getRecordDatestamp(); - final String indexRecordXslt = getLayoutTransformer(format, fields, xslt); - log.info("indexRecordTransformer {}", indexRecordXslt); + final String indexRecordXslt = getLayoutTransformer(format, fields, xslt); + log.info("indexRecordTransformer {}", indexRecordXslt); - final SparkConf conf = new SparkConf(); + final SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - RDD docs = - sc.sequenceFile(inputPath, Text.class, Text.class) - .map(t -> t._2().toString()) - .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) - .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)) - .rdd(); + RDD docs = sc + .sequenceFile(inputPath, Text.class, Text.class) + .map(t -> 
t._2().toString()) + .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) + .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)) + .rdd(); - final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; - SolrSupport.indexDocs(zkHost, collection, batchSize, docs); - }); - } + final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; + SolrSupport.indexDocs(zkHost, collection, batchSize, docs); + }); + } - private static String toIndexRecord(Transformer tr, final String record) { - final StreamResult res = new StreamResult(new StringWriter()); - try { - tr.transform(new StreamSource(new StringReader(record)), res); - return res.getWriter().toString(); - } catch (Throwable e) { - log.error("XPathException on record: \n {}", record, e); - throw new IllegalArgumentException(e); - } - } + private static String toIndexRecord(Transformer tr, final String record) { + final StreamResult res = new StreamResult(new StringWriter()); + try { + tr.transform(new StreamSource(new StringReader(record)), res); + return res.getWriter().toString(); + } catch (Throwable e) { + log.error("XPathException on record: \n {}", record, e); + throw new IllegalArgumentException(e); + } + } - /** - * Creates the XSLT responsible for building the index xml records. - * - * @param format Metadata format name (DMF|TMF) - * @param xslt xslt for building the index record transformer - * @param fields the list of fields - * @return the javax.xml.transform.Transformer - * @throws ISLookUpException could happen - * @throws IOException could happen - * @throws TransformerException could happen - */ - private static String getLayoutTransformer(String format, String fields, String xslt) - throws TransformerException { + /** + * Creates the XSLT responsible for building the index xml records. 
+ * + * @param format Metadata format name (DMF|TMF) + * @param xslt xslt for building the index record transformer + * @param fields the list of fields + * @return the javax.xml.transform.Transformer + * @throws ISLookUpException could happen + * @throws IOException could happen + * @throws TransformerException could happen + */ + private static String getLayoutTransformer(String format, String fields, String xslt) + throws TransformerException { - final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt); - final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter()); + final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt); + final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter()); - layoutTransformer.setParameter("format", format); - layoutTransformer.transform(new StreamSource(new StringReader(fields)), layoutToXsltXslt); + layoutTransformer.setParameter("format", format); + layoutTransformer.transform(new StreamSource(new StringReader(fields)), layoutToXsltXslt); - return layoutToXsltXslt.getWriter().toString(); - } + return layoutToXsltXslt.getWriter().toString(); + } - /** - * method return a solr-compatible string representation of a date, used to mark all records as - * indexed today - * - * @return the parsed date - */ - public static String getRecordDatestamp() { - return new SimpleDateFormat(DATE_FORMAT).format(new Date()); - } + /** + * method return a solr-compatible string representation of a date, used to mark all records as indexed today + * + * @return the parsed date + */ + public static String getRecordDatestamp() { + return new SimpleDateFormat(DATE_FORMAT).format(new Date()); + } - /** - * Method retrieves from the information system the list of fields associated to the given - * MDFormat name - * - * @param isLookup the ISLookup service stub - * @param format the Metadata format name - * @return the string representation of the list of fields to be indexed - * 
@throws ISLookUpDocumentNotFoundException - * @throws ISLookUpException - */ - private static String getLayoutSource(final ISLookUpService isLookup, final String format) - throws ISLookUpDocumentNotFoundException, ISLookUpException { - return doLookup( - isLookup, - String.format( - "collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", - format, LAYOUT)); - } + /** + * Method retrieves from the information system the list of fields associated to the given MDFormat name + * + * @param isLookup the ISLookup service stub + * @param format the Metadata format name + * @return the string representation of the list of fields to be indexed + * @throws ISLookUpDocumentNotFoundException + * @throws ISLookUpException + */ + private static String getLayoutSource(final ISLookUpService isLookup, final String format) + throws ISLookUpDocumentNotFoundException, ISLookUpException { + return doLookup( + isLookup, + String + .format( + "collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", + format, LAYOUT)); + } - /** - * Method retrieves from the information system the openaireLayoutToRecordStylesheet - * - * @param isLookup the ISLookup service stub - * @return the string representation of the XSLT contained in the transformation rule profile - * @throws ISLookUpDocumentNotFoundException - * @throws ISLookUpException - */ - private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException { - return doLookup( - isLookup, - "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" - + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()"); - } + /** + * Method retrieves from the information system the openaireLayoutToRecordStylesheet + * + * @param isLookup the ISLookup service stub + * @return the string representation of 
the XSLT contained in the transformation rule profile + * @throws ISLookUpDocumentNotFoundException + * @throws ISLookUpException + */ + private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException { + return doLookup( + isLookup, + "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()"); + } - /** - * Method retrieves from the information system the IndexDS profile ID associated to the given - * MDFormat name - * - * @param format - * @param isLookup - * @return the IndexDS identifier - * @throws ISLookUpException - */ - private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException { - return doLookup( - isLookup, - String.format( - "collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" - + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", - format)); - } + /** + * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name + * + * @param format + * @param isLookup + * @return the IndexDS identifier + * @throws ISLookUpException + */ + private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException { + return doLookup( + isLookup, + String + .format( + "collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", + format)); + } - /** - * Method retrieves from the information system the zookeeper quorum of the Solr server - * - * @param isLookup - * @return the zookeeper quorum of the Solr server - * @throws ISLookUpException - */ - private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException { - return doLookup( - isLookup, - "for $x in 
/RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()"); - } + /** + * Method retrieves from the information system the zookeeper quorum of the Solr server + * + * @param isLookup + * @return the zookeeper quorum of the Solr server + * @throws ISLookUpException + */ + private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException { + return doLookup( + isLookup, + "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()"); + } - private static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException { - log.info(String.format("running xquery: %s", xquery)); - final String res = isLookup.getResourceProfileByQuery(xquery); - log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ...")); - return res; - } + private static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException { + log.info(String.format("running xquery: %s", xquery)); + final String res = isLookup.getResourceProfileByQuery(xquery); + log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ...")); + return res; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java index e47356c135..a6b3c5591a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java @@ -1,62 +1,67 @@ + package eu.dnetlib.dhp.oa.provision.model; -import com.google.common.base.Objects; import java.io.Serializable; +import com.google.common.base.Objects; + public class EntityRelEntity implements Serializable { - private TypedRow 
entity; - private SortableRelation relation; - private RelatedEntity target; + private TypedRow entity; + private SortableRelation relation; + private RelatedEntity target; - public EntityRelEntity() {} + public EntityRelEntity() { + } - public EntityRelEntity(SortableRelation relation, RelatedEntity target) { - this(null, relation, target); - } + public EntityRelEntity(SortableRelation relation, RelatedEntity target) { + this(null, relation, target); + } - public EntityRelEntity(TypedRow entity, SortableRelation relation, RelatedEntity target) { - this.entity = entity; - this.relation = relation; - this.target = target; - } + public EntityRelEntity(TypedRow entity, SortableRelation relation, RelatedEntity target) { + this.entity = entity; + this.relation = relation; + this.target = target; + } - public TypedRow getEntity() { - return entity; - } + public TypedRow getEntity() { + return entity; + } - public void setEntity(TypedRow entity) { - this.entity = entity; - } + public void setEntity(TypedRow entity) { + this.entity = entity; + } - public SortableRelation getRelation() { - return relation; - } + public SortableRelation getRelation() { + return relation; + } - public void setRelation(SortableRelation relation) { - this.relation = relation; - } + public void setRelation(SortableRelation relation) { + this.relation = relation; + } - public RelatedEntity getTarget() { - return target; - } + public RelatedEntity getTarget() { + return target; + } - public void setTarget(RelatedEntity target) { - this.target = target; - } + public void setTarget(RelatedEntity target) { + this.target = target; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - EntityRelEntity that = (EntityRelEntity) o; - return Objects.equal(entity, that.entity) - && Objects.equal(relation, that.relation) - && Objects.equal(target, that.target); - } + @Override + public boolean equals(Object o) { + if 
(this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + EntityRelEntity that = (EntityRelEntity) o; + return Objects.equal(entity, that.entity) + && Objects.equal(relation, that.relation) + && Objects.equal(target, that.target); + } - @Override - public int hashCode() { - return Objects.hashCode(entity, relation, target); - } + @Override + public int hashCode() { + return Objects.hashCode(entity, relation, target); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java index daa069255a..e29ec9d192 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.provision.model; import java.io.Serializable; @@ -5,25 +6,26 @@ import java.util.List; public class JoinedEntity implements Serializable { - private TypedRow entity; + private TypedRow entity; - private List links; + private List links; - public JoinedEntity() {} + public JoinedEntity() { + } - public TypedRow getEntity() { - return entity; - } + public TypedRow getEntity() { + return entity; + } - public void setEntity(TypedRow entity) { - this.entity = entity; - } + public void setEntity(TypedRow entity) { + this.entity = entity; + } - public List getLinks() { - return links; - } + public List getLinks() { + return links; + } - public void setLinks(List links) { - this.links = links; - } + public void setLinks(List links) { + this.links = links; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java index 9671d505c9..e15ceff760 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java @@ -1,289 +1,295 @@ + package eu.dnetlib.dhp.oa.provision.model; +import java.io.Serializable; +import java.util.List; + import com.google.common.base.Objects; + import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.io.Serializable; -import java.util.List; public class RelatedEntity implements Serializable { - private String id; - private String type; + private String id; + private String type; - // common fields - private StructuredProperty title; - private String websiteurl; // datasource, organizations, projects + // common fields + private StructuredProperty title; + private String websiteurl; // datasource, organizations, projects - // results - private String dateofacceptance; - private String publisher; - private List pid; - private String codeRepositoryUrl; - private Qualifier resulttype; - private List collectedfrom; - private List instances; + // results + private String dateofacceptance; + private String publisher; + private List pid; + private String codeRepositoryUrl; + private Qualifier resulttype; + private List collectedfrom; + private List instances; - // datasource - private String officialname; - private Qualifier datasourcetype; - private Qualifier datasourcetypeui; - private Qualifier openairecompatibility; - // private String aggregatortype; + // datasource + private String officialname; + private Qualifier datasourcetype; + private Qualifier datasourcetypeui; + private Qualifier openairecompatibility; + // private String aggregatortype; - // organization - private String legalname; - private String legalshortname; - private Qualifier country; + // organization + private String legalname; + private 
String legalshortname; + private Qualifier country; - // project - private String projectTitle; - private String code; - private String acronym; - private Qualifier contracttype; - private List fundingtree; + // project + private String projectTitle; + private String code; + private String acronym; + private Qualifier contracttype; + private List fundingtree; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(String type) { - this.type = type; - } + public void setType(String type) { + this.type = type; + } - public StructuredProperty getTitle() { - return title; - } + public StructuredProperty getTitle() { + return title; + } - public void setTitle(StructuredProperty title) { - this.title = title; - } + public void setTitle(StructuredProperty title) { + this.title = title; + } - public String getWebsiteurl() { - return websiteurl; - } + public String getWebsiteurl() { + return websiteurl; + } - public void setWebsiteurl(String websiteurl) { - this.websiteurl = websiteurl; - } + public void setWebsiteurl(String websiteurl) { + this.websiteurl = websiteurl; + } - public String getDateofacceptance() { - return dateofacceptance; - } + public String getDateofacceptance() { + return dateofacceptance; + } - public void setDateofacceptance(String dateofacceptance) { - this.dateofacceptance = dateofacceptance; - } + public void setDateofacceptance(String dateofacceptance) { + this.dateofacceptance = dateofacceptance; + } - public String getPublisher() { - return publisher; - } + public String getPublisher() { + return publisher; + } - public void setPublisher(String publisher) { - this.publisher = publisher; - } + public void setPublisher(String publisher) { + this.publisher = publisher; + } - public List getPid() { - 
return pid; - } + public List getPid() { + return pid; + } - public void setPid(List pid) { - this.pid = pid; - } + public void setPid(List pid) { + this.pid = pid; + } - public String getCodeRepositoryUrl() { - return codeRepositoryUrl; - } + public String getCodeRepositoryUrl() { + return codeRepositoryUrl; + } - public void setCodeRepositoryUrl(String codeRepositoryUrl) { - this.codeRepositoryUrl = codeRepositoryUrl; - } + public void setCodeRepositoryUrl(String codeRepositoryUrl) { + this.codeRepositoryUrl = codeRepositoryUrl; + } - public Qualifier getResulttype() { - return resulttype; - } + public Qualifier getResulttype() { + return resulttype; + } - public void setResulttype(Qualifier resulttype) { - this.resulttype = resulttype; - } + public void setResulttype(Qualifier resulttype) { + this.resulttype = resulttype; + } - public List getCollectedfrom() { - return collectedfrom; - } + public List getCollectedfrom() { + return collectedfrom; + } - public void setCollectedfrom(List collectedfrom) { - this.collectedfrom = collectedfrom; - } + public void setCollectedfrom(List collectedfrom) { + this.collectedfrom = collectedfrom; + } - public List getInstances() { - return instances; - } + public List getInstances() { + return instances; + } - public void setInstances(List instances) { - this.instances = instances; - } + public void setInstances(List instances) { + this.instances = instances; + } - public String getOfficialname() { - return officialname; - } + public String getOfficialname() { + return officialname; + } - public void setOfficialname(String officialname) { - this.officialname = officialname; - } + public void setOfficialname(String officialname) { + this.officialname = officialname; + } - public Qualifier getDatasourcetype() { - return datasourcetype; - } + public Qualifier getDatasourcetype() { + return datasourcetype; + } - public void setDatasourcetype(Qualifier datasourcetype) { - this.datasourcetype = datasourcetype; - } + public void 
setDatasourcetype(Qualifier datasourcetype) { + this.datasourcetype = datasourcetype; + } - public Qualifier getDatasourcetypeui() { - return datasourcetypeui; - } + public Qualifier getDatasourcetypeui() { + return datasourcetypeui; + } - public void setDatasourcetypeui(Qualifier datasourcetypeui) { - this.datasourcetypeui = datasourcetypeui; - } + public void setDatasourcetypeui(Qualifier datasourcetypeui) { + this.datasourcetypeui = datasourcetypeui; + } - public Qualifier getOpenairecompatibility() { - return openairecompatibility; - } + public Qualifier getOpenairecompatibility() { + return openairecompatibility; + } - public void setOpenairecompatibility(Qualifier openairecompatibility) { - this.openairecompatibility = openairecompatibility; - } + public void setOpenairecompatibility(Qualifier openairecompatibility) { + this.openairecompatibility = openairecompatibility; + } - public String getLegalname() { - return legalname; - } + public String getLegalname() { + return legalname; + } - public void setLegalname(String legalname) { - this.legalname = legalname; - } + public void setLegalname(String legalname) { + this.legalname = legalname; + } - public String getLegalshortname() { - return legalshortname; - } + public String getLegalshortname() { + return legalshortname; + } - public void setLegalshortname(String legalshortname) { - this.legalshortname = legalshortname; - } + public void setLegalshortname(String legalshortname) { + this.legalshortname = legalshortname; + } - public Qualifier getCountry() { - return country; - } + public Qualifier getCountry() { + return country; + } - public void setCountry(Qualifier country) { - this.country = country; - } + public void setCountry(Qualifier country) { + this.country = country; + } - public String getProjectTitle() { - return projectTitle; - } + public String getProjectTitle() { + return projectTitle; + } - public void setProjectTitle(String projectTitle) { - this.projectTitle = projectTitle; - } + public 
void setProjectTitle(String projectTitle) { + this.projectTitle = projectTitle; + } - public String getCode() { - return code; - } + public String getCode() { + return code; + } - public void setCode(String code) { - this.code = code; - } + public void setCode(String code) { + this.code = code; + } - public String getAcronym() { - return acronym; - } + public String getAcronym() { + return acronym; + } - public void setAcronym(String acronym) { - this.acronym = acronym; - } + public void setAcronym(String acronym) { + this.acronym = acronym; + } - public Qualifier getContracttype() { - return contracttype; - } + public Qualifier getContracttype() { + return contracttype; + } - public void setContracttype(Qualifier contracttype) { - this.contracttype = contracttype; - } + public void setContracttype(Qualifier contracttype) { + this.contracttype = contracttype; + } - public List getFundingtree() { - return fundingtree; - } + public List getFundingtree() { + return fundingtree; + } - public void setFundingtree(List fundingtree) { - this.fundingtree = fundingtree; - } + public void setFundingtree(List fundingtree) { + this.fundingtree = fundingtree; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - RelatedEntity that = (RelatedEntity) o; - return Objects.equal(id, that.id) - && Objects.equal(type, that.type) - && Objects.equal(title, that.title) - && Objects.equal(websiteurl, that.websiteurl) - && Objects.equal(dateofacceptance, that.dateofacceptance) - && Objects.equal(publisher, that.publisher) - && Objects.equal(pid, that.pid) - && Objects.equal(codeRepositoryUrl, that.codeRepositoryUrl) - && Objects.equal(resulttype, that.resulttype) - && Objects.equal(collectedfrom, that.collectedfrom) - && Objects.equal(instances, that.instances) - && Objects.equal(officialname, that.officialname) - && Objects.equal(datasourcetype, that.datasourcetype) - && 
Objects.equal(datasourcetypeui, that.datasourcetypeui) - && Objects.equal(openairecompatibility, that.openairecompatibility) - && Objects.equal(legalname, that.legalname) - && Objects.equal(legalshortname, that.legalshortname) - && Objects.equal(country, that.country) - && Objects.equal(projectTitle, that.projectTitle) - && Objects.equal(code, that.code) - && Objects.equal(acronym, that.acronym) - && Objects.equal(contracttype, that.contracttype) - && Objects.equal(fundingtree, that.fundingtree); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + RelatedEntity that = (RelatedEntity) o; + return Objects.equal(id, that.id) + && Objects.equal(type, that.type) + && Objects.equal(title, that.title) + && Objects.equal(websiteurl, that.websiteurl) + && Objects.equal(dateofacceptance, that.dateofacceptance) + && Objects.equal(publisher, that.publisher) + && Objects.equal(pid, that.pid) + && Objects.equal(codeRepositoryUrl, that.codeRepositoryUrl) + && Objects.equal(resulttype, that.resulttype) + && Objects.equal(collectedfrom, that.collectedfrom) + && Objects.equal(instances, that.instances) + && Objects.equal(officialname, that.officialname) + && Objects.equal(datasourcetype, that.datasourcetype) + && Objects.equal(datasourcetypeui, that.datasourcetypeui) + && Objects.equal(openairecompatibility, that.openairecompatibility) + && Objects.equal(legalname, that.legalname) + && Objects.equal(legalshortname, that.legalshortname) + && Objects.equal(country, that.country) + && Objects.equal(projectTitle, that.projectTitle) + && Objects.equal(code, that.code) + && Objects.equal(acronym, that.acronym) + && Objects.equal(contracttype, that.contracttype) + && Objects.equal(fundingtree, that.fundingtree); + } - @Override - public int hashCode() { - return Objects.hashCode( - id, - type, - title, - websiteurl, - dateofacceptance, - publisher, - pid, - codeRepositoryUrl, - resulttype, - 
collectedfrom, - instances, - officialname, - datasourcetype, - datasourcetypeui, - openairecompatibility, - legalname, - legalshortname, - country, - projectTitle, - code, - acronym, - contracttype, - fundingtree); - } + @Override + public int hashCode() { + return Objects + .hashCode( + id, + type, + title, + websiteurl, + dateofacceptance, + publisher, + pid, + codeRepositoryUrl, + resulttype, + collectedfrom, + instances, + officialname, + datasourcetype, + datasourcetypeui, + openairecompatibility, + legalname, + legalshortname, + country, + projectTitle, + code, + acronym, + contracttype, + fundingtree); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java index 0a35a9752d..7c866001be 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java @@ -1,34 +1,38 @@ + package eu.dnetlib.dhp.oa.provision.model; -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Maps; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.io.Serializable; import java.util.Map; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.Relation; + public class SortableRelation extends Relation implements Comparable, Serializable { - private static final Map weights = Maps.newHashMap(); + private static final Map weights = Maps.newHashMap(); - static { - weights.put("outcome", 0); - weights.put("supplement", 1); - weights.put("publicationDataset", 2); - weights.put("relationship", 3); - weights.put("similarity", 4); - weights.put("affiliation", 5); + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + 
weights.put("publicationDataset", 2); + weights.put("relationship", 3); + weights.put("similarity", 4); + weights.put("affiliation", 5); - weights.put("provision", 6); - weights.put("participation", 7); - weights.put("dedup", 8); - } + weights.put("provision", 6); + weights.put("participation", 7); + weights.put("dedup", 8); + } - @Override - public int compareTo(Relation o) { - return ComparisonChain.start() - .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) - .compare(getSource(), o.getSource()) - .compare(getTarget(), o.getTarget()) - .result(); - } + @Override + public int compareTo(Relation o) { + return ComparisonChain + .start() + .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) + .compare(getSource(), o.getSource()) + .compare(getTarget(), o.getTarget()) + .result(); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java index e7e4aea3cf..5ebe9c9eb0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java @@ -1,48 +1,53 @@ + package eu.dnetlib.dhp.oa.provision.model; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.io.Serializable; import java.util.Objects; +import eu.dnetlib.dhp.schema.oaf.Relation; + public class Tuple2 implements Serializable { - private Relation relation; + private Relation relation; - private RelatedEntity relatedEntity; + private RelatedEntity relatedEntity; - public Tuple2() {} + public Tuple2() { + } - public Tuple2(Relation relation, RelatedEntity relatedEntity) { - this.relation = relation; - this.relatedEntity = relatedEntity; - } + public Tuple2(Relation relation, RelatedEntity relatedEntity) { + this.relation = relation; + this.relatedEntity = relatedEntity; + } - public Relation 
getRelation() { - return relation; - } + public Relation getRelation() { + return relation; + } - public void setRelation(Relation relation) { - this.relation = relation; - } + public void setRelation(Relation relation) { + this.relation = relation; + } - public RelatedEntity getRelatedEntity() { - return relatedEntity; - } + public RelatedEntity getRelatedEntity() { + return relatedEntity; + } - public void setRelatedEntity(RelatedEntity relatedEntity) { - this.relatedEntity = relatedEntity; - } + public void setRelatedEntity(RelatedEntity relatedEntity) { + this.relatedEntity = relatedEntity; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Tuple2 t2 = (Tuple2) o; - return getRelation().equals(t2.getRelation()); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Tuple2 t2 = (Tuple2) o; + return getRelation().equals(t2.getRelation()); + } - @Override - public int hashCode() { - return Objects.hash(getRelation().hashCode()); - } + @Override + public int hashCode() { + return Objects.hash(getRelation().hashCode()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java index 01067707ef..cbec372e43 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java @@ -1,60 +1,64 @@ + package eu.dnetlib.dhp.oa.provision.model; -import com.google.common.base.Objects; import java.io.Serializable; +import com.google.common.base.Objects; + public class TypedRow implements Serializable { - private String id; + private String id; - private Boolean deleted; + private Boolean deleted; - 
private String type; + private String type; - private String oaf; + private String oaf; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public Boolean getDeleted() { - return deleted; - } + public Boolean getDeleted() { + return deleted; + } - public void setDeleted(Boolean deleted) { - this.deleted = deleted; - } + public void setDeleted(Boolean deleted) { + this.deleted = deleted; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(String type) { - this.type = type; - } + public void setType(String type) { + this.type = type; + } - public String getOaf() { - return oaf; - } + public String getOaf() { + return oaf; + } - public void setOaf(String oaf) { - this.oaf = oaf; - } + public void setOaf(String oaf) { + this.oaf = oaf; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - TypedRow typedRow2 = (TypedRow) o; - return Objects.equal(id, typedRow2.id); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TypedRow typedRow2 = (TypedRow) o; + return Objects.equal(id, typedRow2.id); + } - @Override - public int hashCode() { - return Objects.hashCode(id); - } + @Override + public int hashCode() { + return Objects.hashCode(id); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java index dc61704458..8afd6400c9 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java +++ 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java @@ -1,51 +1,52 @@ + package eu.dnetlib.dhp.oa.provision.utils; import java.io.Serializable; public class ContextDef implements Serializable { - private String id; - private String label; - private String name; - private String type; + private String id; + private String label; + private String name; + private String type; - public ContextDef(final String id, final String label, final String name, final String type) { - super(); - this.setId(id); - this.setLabel(label); - this.setName(name); - this.setType(type); - } + public ContextDef(final String id, final String label, final String name, final String type) { + super(); + this.setId(id); + this.setLabel(label); + this.setName(name); + this.setType(type); + } - public String getLabel() { - return label; - } + public String getLabel() { + return label; + } - public void setLabel(final String label) { - this.label = label; - } + public void setLabel(final String label) { + this.label = label; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(final String name) { - this.name = name; - } + public void setName(final String name) { + this.name = name; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(final String type) { - this.type = type; - } + public void setType(final String type) { + this.type = type; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java index d1d6521db6..ac418f2b9b 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java @@ -1,46 +1,49 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import com.google.common.base.Joiner; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import java.io.Serializable; import java.io.StringReader; import java.util.HashMap; + import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import com.google.common.base.Joiner; + +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + public class ContextMapper extends HashMap implements Serializable { - private static final long serialVersionUID = 2159682308502487305L; + private static final long serialVersionUID = 2159682308502487305L; - private static final String XQUERY = - "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return "; + private static final String XQUERY = "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return "; - public static ContextMapper fromIS(final String isLookupUrl) - throws DocumentException, ISLookUpException { - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - StringBuilder sb = new StringBuilder(""); - Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY)); - sb.append(""); - return fromXml(sb.toString()); - } + public static ContextMapper fromIS(final String isLookupUrl) + throws DocumentException, ISLookUpException { + ISLookUpService isLookUp = 
ISLookupClientFactory.getLookUpService(isLookupUrl); + StringBuilder sb = new StringBuilder(""); + Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY)); + sb.append(""); + return fromXml(sb.toString()); + } - public static ContextMapper fromXml(final String xml) throws DocumentException { - final ContextMapper contextMapper = new ContextMapper(); + public static ContextMapper fromXml(final String xml) throws DocumentException { + final ContextMapper contextMapper = new ContextMapper(); - final Document doc = new SAXReader().read(new StringReader(xml)); - for (Object o : doc.selectNodes("//entry")) { - Node node = (Node) o; - String id = node.valueOf("./@id"); - String label = node.valueOf("./@label"); - String name = node.valueOf("./@name"); - String type = node.valueOf("./@type") + ""; + final Document doc = new SAXReader().read(new StringReader(xml)); + for (Object o : doc.selectNodes("//entry")) { + Node node = (Node) o; + String id = node.valueOf("./@id"); + String label = node.valueOf("./@label"); + String name = node.valueOf("./@name"); + String type = node.valueOf("./@type") + ""; - contextMapper.put(id, new ContextDef(id, label, name, type)); - } - return contextMapper; - } + contextMapper.put(id, new ContextDef(id, label, name, type)); + } + return contextMapper; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java index 96ffb4c900..0e742365ab 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java @@ -1,23 +1,27 @@ + package eu.dnetlib.dhp.oa.provision.utils; import static org.apache.commons.lang3.StringUtils.substringAfter; -import com.google.common.collect.Sets; -import eu.dnetlib.dhp.schema.oaf.*; 
import java.util.Set; +import com.google.common.collect.Sets; + +import eu.dnetlib.dhp.schema.oaf.*; + public class GraphMappingUtils { - public static final String SEPARATOR = "_"; + public static final String SEPARATOR = "_"; - public static Set authorPidTypes = Sets.newHashSet("orcid", "magidentifier"); + public static Set authorPidTypes = Sets.newHashSet("orcid", "magidentifier"); - public static String removePrefix(final String s) { - if (s.contains("|")) return substringAfter(s, "|"); - return s; - } + public static String removePrefix(final String s) { + if (s.contains("|")) + return substringAfter(s, "|"); + return s; + } - public static String getRelDescriptor(String relType, String subRelType, String relClass) { - return relType + SEPARATOR + subRelType + SEPARATOR + relClass; - } + public static String getRelDescriptor(String relType, String subRelType, String relClass) { + return relType + SEPARATOR + subRelType + SEPARATOR + relClass; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java index 823997b6d6..9dbac1936c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java @@ -1,47 +1,69 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import eu.dnetlib.dhp.schema.oaf.Qualifier; import java.util.Comparator; +import eu.dnetlib.dhp.schema.oaf.Qualifier; + public class LicenseComparator implements Comparator { - @Override - public int compare(Qualifier left, Qualifier right) { + @Override + public int compare(Qualifier left, Qualifier right) { - if (left == null && right == null) return 0; - if (left == null) return 1; - if (right == null) return -1; + if (left == null && right == null) + return 0; + if (left == 
null) + return 1; + if (right == null) + return -1; - String lClass = left.getClassid(); - String rClass = right.getClassid(); + String lClass = left.getClassid(); + String rClass = right.getClassid(); - if (lClass.equals(rClass)) return 0; + if (lClass.equals(rClass)) + return 0; - if (lClass.equals("OPEN SOURCE")) return -1; - if (rClass.equals("OPEN SOURCE")) return 1; + if (lClass.equals("OPEN SOURCE")) + return -1; + if (rClass.equals("OPEN SOURCE")) + return 1; - if (lClass.equals("OPEN")) return -1; - if (rClass.equals("OPEN")) return 1; + if (lClass.equals("OPEN")) + return -1; + if (rClass.equals("OPEN")) + return 1; - if (lClass.equals("6MONTHS")) return -1; - if (rClass.equals("6MONTHS")) return 1; + if (lClass.equals("6MONTHS")) + return -1; + if (rClass.equals("6MONTHS")) + return 1; - if (lClass.equals("12MONTHS")) return -1; - if (rClass.equals("12MONTHS")) return 1; + if (lClass.equals("12MONTHS")) + return -1; + if (rClass.equals("12MONTHS")) + return 1; - if (lClass.equals("EMBARGO")) return -1; - if (rClass.equals("EMBARGO")) return 1; + if (lClass.equals("EMBARGO")) + return -1; + if (rClass.equals("EMBARGO")) + return 1; - if (lClass.equals("RESTRICTED")) return -1; - if (rClass.equals("RESTRICTED")) return 1; + if (lClass.equals("RESTRICTED")) + return -1; + if (rClass.equals("RESTRICTED")) + return 1; - if (lClass.equals("CLOSED")) return -1; - if (rClass.equals("CLOSED")) return 1; + if (lClass.equals("CLOSED")) + return -1; + if (rClass.equals("CLOSED")) + return 1; - if (lClass.equals("UNKNOWN")) return -1; - if (rClass.equals("UNKNOWN")) return 1; + if (lClass.equals("UNKNOWN")) + return -1; + if (rClass.equals("UNKNOWN")) + return 1; - // Else (but unlikely), lexicographical ordering will do. - return lClass.compareTo(rClass); - } + // Else (but unlikely), lexicographical ordering will do. 
+ return lClass.compareTo(rClass); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java index 6db8b12de7..bac2278e6a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java @@ -1,29 +1,30 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import org.apache.spark.Partitioner; import org.apache.spark.util.Utils; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; + /** - * Used in combination with SortableRelationKey, allows to partition the records by source id, - * therefore allowing to sort relations sharing the same source id by the ordering defined in - * SortableRelationKey. + * Used in combination with SortableRelationKey, allows to partition the records by source id, therefore allowing to + * sort relations sharing the same source id by the ordering defined in SortableRelationKey. 
*/ public class RelationPartitioner extends Partitioner { - private int numPartitions; + private int numPartitions; - public RelationPartitioner(int numPartitions) { - this.numPartitions = numPartitions; - } + public RelationPartitioner(int numPartitions) { + this.numPartitions = numPartitions; + } - @Override - public int numPartitions() { - return numPartitions; - } + @Override + public int numPartitions() { + return numPartitions; + } - @Override - public int getPartition(Object key) { - return Utils.nonNegativeMod(((SortableRelation) key).getSource().hashCode(), numPartitions()); - } + @Override + public int getPartition(Object key) { + return Utils.nonNegativeMod(((SortableRelation) key).getSource().hashCode(), numPartitions()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java index c472e6e85c..de221b2ee3 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java @@ -1,262 +1,260 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import com.google.common.collect.Lists; import java.io.StringReader; import java.io.StringWriter; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; + import javax.xml.stream.*; import javax.xml.stream.events.Namespace; import javax.xml.stream.events.StartElement; import javax.xml.stream.events.XMLEvent; + import org.apache.solr.common.SolrInputDocument; +import com.google.common.collect.Lists; + /** * Optimized version of the document parser, drop in replacement of InputDocumentFactory. - * - *

Faster because: - * + *

+ * Faster because: *

- * - *

This class is fully reentrant and can be invoked in parallel. + *

+ * This class is fully reentrant and can be invoked in parallel. * * @author claudio */ public class StreamingInputDocumentFactory { - private static final String INDEX_FIELD_PREFIX = "__"; + private static final String INDEX_FIELD_PREFIX = "__"; - private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion"; + private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion"; - private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid"; + private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid"; - private static final String RESULT = "result"; + private static final String RESULT = "result"; - private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT; + private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT; - private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier"; + private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier"; - private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); + private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); - private static final List dateFormats = - Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy"); + private static final List dateFormats = Arrays + .asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy"); - private static final String DEFAULTDNETRESULT = "dnetResult"; + private static final String DEFAULTDNETRESULT = "dnetResult"; - private static final String TARGETFIELDS = "targetFields"; + private static final String TARGETFIELDS = "targetFields"; - private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier"; + private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier"; - private static final String ROOT_ELEMENT = "indexRecord"; + private static final String ROOT_ELEMENT = "indexRecord"; - private static final int MAX_FIELD_LENGTH = 25000; + private 
static final int MAX_FIELD_LENGTH = 25000; - private ThreadLocal inputFactory = - ThreadLocal.withInitial(() -> XMLInputFactory.newInstance()); + private ThreadLocal inputFactory = ThreadLocal.withInitial(() -> XMLInputFactory.newInstance()); - private ThreadLocal outputFactory = - ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance()); + private ThreadLocal outputFactory = ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance()); - private ThreadLocal eventFactory = - ThreadLocal.withInitial(() -> XMLEventFactory.newInstance()); + private ThreadLocal eventFactory = ThreadLocal.withInitial(() -> XMLEventFactory.newInstance()); - private String version; + private String version; - private String dsId; + private String dsId; - private String resultName = DEFAULTDNETRESULT; + private String resultName = DEFAULTDNETRESULT; - public StreamingInputDocumentFactory(final String version, final String dsId) { - this(version, dsId, DEFAULTDNETRESULT); - } + public StreamingInputDocumentFactory(final String version, final String dsId) { + this(version, dsId, DEFAULTDNETRESULT); + } - public StreamingInputDocumentFactory( - final String version, final String dsId, final String resultName) { - this.version = version; - this.dsId = dsId; - this.resultName = resultName; - } + public StreamingInputDocumentFactory( + final String version, final String dsId, final String resultName) { + this.version = version; + this.dsId = dsId; + this.resultName = resultName; + } - public SolrInputDocument parseDocument(final String inputDocument) { + public SolrInputDocument parseDocument(final String inputDocument) { - final StringWriter results = new StringWriter(); - final List nsList = Lists.newLinkedList(); - try { + final StringWriter results = new StringWriter(); + final List nsList = Lists.newLinkedList(); + try { - XMLEventReader parser = - inputFactory.get().createXMLEventReader(new StringReader(inputDocument)); + XMLEventReader parser = 
inputFactory.get().createXMLEventReader(new StringReader(inputDocument)); - final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>()); + final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>()); - while (parser.hasNext()) { - final XMLEvent event = parser.nextEvent(); - if ((event != null) && event.isStartElement()) { - final String localName = event.asStartElement().getName().getLocalPart(); + while (parser.hasNext()) { + final XMLEvent event = parser.nextEvent(); + if ((event != null) && event.isStartElement()) { + final String localName = event.asStartElement().getName().getLocalPart(); - if (ROOT_ELEMENT.equals(localName)) { - nsList.addAll(getNamespaces(event)); - } else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) { - final XMLEvent text = parser.nextEvent(); - String recordId = getText(text); - indexDocument.addField(INDEX_RECORD_ID, recordId); - } else if (TARGETFIELDS.equals(localName)) { - parseTargetFields(indexDocument, parser); - } else if (resultName.equals(localName)) { - copyResult(indexDocument, results, parser, nsList, resultName); - } - } - } + if (ROOT_ELEMENT.equals(localName)) { + nsList.addAll(getNamespaces(event)); + } else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) { + final XMLEvent text = parser.nextEvent(); + String recordId = getText(text); + indexDocument.addField(INDEX_RECORD_ID, recordId); + } else if (TARGETFIELDS.equals(localName)) { + parseTargetFields(indexDocument, parser); + } else if (resultName.equals(localName)) { + copyResult(indexDocument, results, parser, nsList, resultName); + } + } + } - if (version != null) { - indexDocument.addField(DS_VERSION, version); - } + if (version != null) { + indexDocument.addField(DS_VERSION, version); + } - if (dsId != null) { - indexDocument.addField(DS_ID, dsId); - } + if (dsId != null) { + indexDocument.addField(DS_ID, dsId); + } - if (!indexDocument.containsKey(INDEX_RECORD_ID)) { - indexDocument.clear(); - System.err.println("missing 
indexrecord id:\n" + inputDocument); - } + if (!indexDocument.containsKey(INDEX_RECORD_ID)) { + indexDocument.clear(); + System.err.println("missing indexrecord id:\n" + inputDocument); + } - return indexDocument; - } catch (XMLStreamException e) { - return new SolrInputDocument(); - } - } + return indexDocument; + } catch (XMLStreamException e) { + return new SolrInputDocument(); + } + } - private List getNamespaces(final XMLEvent event) { - final List res = Lists.newLinkedList(); - @SuppressWarnings("unchecked") - Iterator nsIter = event.asStartElement().getNamespaces(); - while (nsIter.hasNext()) { - Namespace ns = nsIter.next(); - res.add(ns); - } - return res; - } + private List getNamespaces(final XMLEvent event) { + final List res = Lists.newLinkedList(); + @SuppressWarnings("unchecked") + Iterator nsIter = event.asStartElement().getNamespaces(); + while (nsIter.hasNext()) { + Namespace ns = nsIter.next(); + res.add(ns); + } + return res; + } - /** - * Parse the targetFields block and add fields to the solr document. - * - * @param indexDocument - * @param parser - * @throws XMLStreamException - */ - protected void parseTargetFields( - final SolrInputDocument indexDocument, final XMLEventReader parser) - throws XMLStreamException { + /** + * Parse the targetFields block and add fields to the solr document. 
+ * + * @param indexDocument + * @param parser + * @throws XMLStreamException + */ + protected void parseTargetFields( + final SolrInputDocument indexDocument, final XMLEventReader parser) + throws XMLStreamException { - boolean hasFields = false; + boolean hasFields = false; - while (parser.hasNext()) { - final XMLEvent targetEvent = parser.nextEvent(); - if (targetEvent.isEndElement() - && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) { - break; - } + while (parser.hasNext()) { + final XMLEvent targetEvent = parser.nextEvent(); + if (targetEvent.isEndElement() + && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) { + break; + } - if (targetEvent.isStartElement()) { - final String fieldName = targetEvent.asStartElement().getName().getLocalPart(); - final XMLEvent text = parser.nextEvent(); + if (targetEvent.isStartElement()) { + final String fieldName = targetEvent.asStartElement().getName().getLocalPart(); + final XMLEvent text = parser.nextEvent(); - String data = getText(text); + String data = getText(text); - addField(indexDocument, fieldName, data); - hasFields = true; - } - } + addField(indexDocument, fieldName, data); + hasFields = true; + } + } - if (!hasFields) { - indexDocument.clear(); - } - } + if (!hasFields) { + indexDocument.clear(); + } + } - /** - * Copy the /indexRecord/result element and children, preserving namespace declarations etc. - * - * @param indexDocument - * @param results - * @param parser - * @param nsList - * @throws XMLStreamException - */ - protected void copyResult( - final SolrInputDocument indexDocument, - final StringWriter results, - final XMLEventReader parser, - final List nsList, - final String dnetResult) - throws XMLStreamException { - final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results); + /** + * Copy the /indexRecord/result element and children, preserving namespace declarations etc. 
+ * + * @param indexDocument + * @param results + * @param parser + * @param nsList + * @throws XMLStreamException + */ + protected void copyResult( + final SolrInputDocument indexDocument, + final StringWriter results, + final XMLEventReader parser, + final List nsList, + final String dnetResult) + throws XMLStreamException { + final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results); - for (Namespace ns : nsList) { - eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI()); - } + for (Namespace ns : nsList) { + eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI()); + } - StartElement newRecord = - eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator()); + StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator()); - // new root record - writer.add(newRecord); + // new root record + writer.add(newRecord); - // copy the rest as it is - while (parser.hasNext()) { - final XMLEvent resultEvent = parser.nextEvent(); + // copy the rest as it is + while (parser.hasNext()) { + final XMLEvent resultEvent = parser.nextEvent(); - // TODO: replace with depth tracking instead of close tag tracking. - if (resultEvent.isEndElement() - && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) { - writer.add(eventFactory.get().createEndElement("", null, RESULT)); - break; - } + // TODO: replace with depth tracking instead of close tag tracking. + if (resultEvent.isEndElement() + && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) { + writer.add(eventFactory.get().createEndElement("", null, RESULT)); + break; + } - writer.add(resultEvent); - } - writer.close(); - indexDocument.addField(INDEX_RESULT, results.toString()); - } + writer.add(resultEvent); + } + writer.close(); + indexDocument.addField(INDEX_RESULT, results.toString()); + } - /** - * Helper used to add a field to a solr doc. 
It avoids to add empy fields - * - * @param indexDocument - * @param field - * @param value - */ - private final void addField( - final SolrInputDocument indexDocument, final String field, final String value) { - String cleaned = value.trim(); - if (!cleaned.isEmpty()) { - // log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n"); - indexDocument.addField(field.toLowerCase(), cleaned); - } - } + /** + * Helper used to add a field to a solr doc. It avoids to add empy fields + * + * @param indexDocument + * @param field + * @param value + */ + private final void addField( + final SolrInputDocument indexDocument, final String field, final String value) { + String cleaned = value.trim(); + if (!cleaned.isEmpty()) { + // log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n"); + indexDocument.addField(field.toLowerCase(), cleaned); + } + } - /** - * Helper used to get the string from a text element. - * - * @param text - * @return the - */ - protected final String getText(final XMLEvent text) { - if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + - // text.asEndElement().getName().getLocalPart()); - return ""; + /** + * Helper used to get the string from a text element. 
+ * + * @param text + * @return the + */ + protected final String getText(final XMLEvent text) { + if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + + // text.asEndElement().getName().getLocalPart()); + return ""; - final String data = text.asCharacters().getData(); - if (data != null && data.length() > MAX_FIELD_LENGTH) { - return data.substring(0, MAX_FIELD_LENGTH); - } + final String data = text.asCharacters().getData(); + if (data != null && data.length() > MAX_FIELD_LENGTH) { + return data.substring(0, MAX_FIELD_LENGTH); + } - return data; - } + return data; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java index 7c919d952d..3d9cf1ae78 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java @@ -1,113 +1,117 @@ + package eu.dnetlib.dhp.oa.provision.utils; import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.OafEntity; import java.io.IOException; import java.util.Collection; import java.util.List; import java.util.stream.Collectors; + import org.apache.commons.lang3.StringUtils; import org.stringtemplate.v4.ST; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.OafEntity; + public class TemplateFactory { - private TemplateResources resources; + private TemplateResources resources; - private static final char DELIMITER = '$'; + private static final char DELIMITER = '$'; - public TemplateFactory() { - try { - resources = new TemplateResources(); - } catch (IOException e) { - throw new 
IllegalStateException(e); - } - } + public TemplateFactory() { + try { + resources = new TemplateResources(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } - public String buildBody( - final String type, - final List metadata, - final List rels, - final List children, - final List extraInfo) { - ST body = getTemplate(resources.getEntity()); + public String buildBody( + final String type, + final List metadata, + final List rels, + final List children, + final List extraInfo) { + ST body = getTemplate(resources.getEntity()); - body.add("name", type); - body.add("metadata", metadata); - body.add("rels", rels); - body.add("children", children); - body.add("extrainfo", extraInfo); + body.add("name", type); + body.add("metadata", metadata); + body.add("rels", rels); + body.add("children", children); + body.add("extrainfo", extraInfo); - return body.render(); - } + return body.render(); + } - public String getChild(final String name, final String id, final List metadata) { - return getTemplate(resources.getChild()) - .add("name", name) - .add("hasId", !(id == null)) - .add("id", id != null ? escapeXml(removePrefix(id)) : "") - .add("metadata", metadata) - .render(); - } + public String getChild(final String name, final String id, final List metadata) { + return getTemplate(resources.getChild()) + .add("name", name) + .add("hasId", !(id == null)) + .add("id", id != null ? 
escapeXml(removePrefix(id)) : "") + .add("metadata", metadata) + .render(); + } - public String buildRecord( - final OafEntity entity, final String schemaLocation, final String body) { - return getTemplate(resources.getRecord()) - .add("id", escapeXml(removePrefix(entity.getId()))) - .add("dateofcollection", entity.getDateofcollection()) - .add("dateoftransformation", entity.getDateoftransformation()) - .add("schemaLocation", schemaLocation) - .add("it", body) - .render(); - } + public String buildRecord( + final OafEntity entity, final String schemaLocation, final String body) { + return getTemplate(resources.getRecord()) + .add("id", escapeXml(removePrefix(entity.getId()))) + .add("dateofcollection", entity.getDateofcollection()) + .add("dateoftransformation", entity.getDateoftransformation()) + .add("schemaLocation", schemaLocation) + .add("it", body) + .render(); + } - public String getRel( - final String type, - final String objIdentifier, - final Collection fields, - final String semanticclass, - final String semantischeme, - final DataInfo info) { - return getTemplate(resources.getRel()) - .add("type", type) - .add("objIdentifier", escapeXml(removePrefix(objIdentifier))) - .add("class", semanticclass) - .add("scheme", semantischeme) - .add("metadata", fields) - .add("inferred", info.getInferred()) - .add("trust", info.getTrust()) - .add("inferenceprovenance", info.getInferenceprovenance()) - .add( - "provenanceaction", - info.getProvenanceaction() != null ? 
info.getProvenanceaction().getClassid() : "") - .render(); - } + public String getRel( + final String type, + final String objIdentifier, + final Collection fields, + final String semanticclass, + final String semantischeme, + final DataInfo info) { + return getTemplate(resources.getRel()) + .add("type", type) + .add("objIdentifier", escapeXml(removePrefix(objIdentifier))) + .add("class", semanticclass) + .add("scheme", semantischeme) + .add("metadata", fields) + .add("inferred", info.getInferred()) + .add("trust", info.getTrust()) + .add("inferenceprovenance", info.getInferenceprovenance()) + .add( + "provenanceaction", + info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "") + .render(); + } - public String getInstance( - final String resultId, final List instancemetadata, final List webresources) { - return getTemplate(resources.getInstance()) - .add("instanceId", escapeXml(removePrefix(resultId))) - .add("metadata", instancemetadata) - .add( - "webresources", - webresources.stream() - .filter(StringUtils::isNotBlank) - .map(w -> getWebResource(w)) - .collect(Collectors.toList())) - .render(); - } + public String getInstance( + final String resultId, final List instancemetadata, final List webresources) { + return getTemplate(resources.getInstance()) + .add("instanceId", escapeXml(removePrefix(resultId))) + .add("metadata", instancemetadata) + .add( + "webresources", + webresources + .stream() + .filter(StringUtils::isNotBlank) + .map(w -> getWebResource(w)) + .collect(Collectors.toList())) + .render(); + } - private String getWebResource(final String identifier) { - return getTemplate(resources.getWebresource()) - .add("identifier", escapeXml(identifier)) - .render(); - } + private String getWebResource(final String identifier) { + return getTemplate(resources.getWebresource()) + .add("identifier", escapeXml(identifier)) + .render(); + } - // HELPERS + // HELPERS - private ST getTemplate(final String res) { - return new ST(res, 
DELIMITER, DELIMITER); - } + private ST getTemplate(final String res) { + return new ST(res, DELIMITER, DELIMITER); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java index 3ffc33bd8e..746f8ebe68 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java @@ -1,50 +1,53 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import com.google.common.io.Resources; import java.io.IOException; import java.nio.charset.StandardCharsets; +import com.google.common.io.Resources; + public class TemplateResources { - private String record = read("eu/dnetlib/dhp/oa/provision/template/record.st"); + private String record = read("eu/dnetlib/dhp/oa/provision/template/record.st"); - private String instance = read("eu/dnetlib/dhp/oa/provision/template/instance.st"); + private String instance = read("eu/dnetlib/dhp/oa/provision/template/instance.st"); - private String rel = read("eu/dnetlib/dhp/oa/provision/template/rel.st"); + private String rel = read("eu/dnetlib/dhp/oa/provision/template/rel.st"); - private String webresource = read("eu/dnetlib/dhp/oa/provision/template/webresource.st"); + private String webresource = read("eu/dnetlib/dhp/oa/provision/template/webresource.st"); - private String child = read("eu/dnetlib/dhp/oa/provision/template/child.st"); + private String child = read("eu/dnetlib/dhp/oa/provision/template/child.st"); - private String entity = read("eu/dnetlib/dhp/oa/provision/template/entity.st"); + private String entity = read("eu/dnetlib/dhp/oa/provision/template/entity.st"); - private static String read(final String classpathResource) throws IOException { - return 
Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8); - } + private static String read(final String classpathResource) throws IOException { + return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8); + } - public TemplateResources() throws IOException {} + public TemplateResources() throws IOException { + } - public String getEntity() { - return entity; - } + public String getEntity() { + return entity; + } - public String getRecord() { - return record; - } + public String getRecord() { + return record; + } - public String getInstance() { - return instance; - } + public String getInstance() { + return instance; + } - public String getRel() { - return rel; - } + public String getRel() { + return rel; + } - public String getWebresource() { - return webresource; - } + public String getWebresource() { + return webresource; + } - public String getChild() { - return child; - } + public String getChild() { + return child; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 52a5094099..f667d9f3cb 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -1,23 +1,10 @@ + package eu.dnetlib.dhp.oa.provision.utils; import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; import static org.apache.commons.lang3.StringUtils.isNotBlank; import static org.apache.commons.lang3.StringUtils.substringBefore; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Joiner; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; 
-import com.mycila.xmltool.XMLDoc; -import com.mycila.xmltool.XMLTag; -import eu.dnetlib.dhp.oa.provision.model.*; -import eu.dnetlib.dhp.schema.common.EntityType; -import eu.dnetlib.dhp.schema.common.MainEntityType; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.Result; import java.io.IOException; import java.io.Serializable; import java.io.StringReader; @@ -27,9 +14,11 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; + import javax.xml.transform.*; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; + import org.apache.commons.lang3.StringUtils; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; @@ -40,1130 +29,1347 @@ import org.dom4j.io.OutputFormat; import org.dom4j.io.SAXReader; import org.dom4j.io.XMLWriter; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.mycila.xmltool.XMLDoc; +import com.mycila.xmltool.XMLTag; + +import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.schema.common.EntityType; +import eu.dnetlib.dhp.schema.common.MainEntityType; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Result; + public class XmlRecordFactory implements Serializable { - public static final String REL_SUBTYPE_DEDUP = "dedup"; - private Map accumulators; - - private Set specialDatasourceTypes; - - private ContextMapper contextMapper; - - private String schemaLocation; - - private boolean indent = false; - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - public XmlRecordFactory( - final ContextMapper contextMapper, - final boolean indent, - final String 
schemaLocation, - final String otherDatasourceTypesUForUI) { - - this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI); - } - - public XmlRecordFactory( - final Map accumulators, - final ContextMapper contextMapper, - final boolean indent, - final String schemaLocation, - final String otherDatasourceTypesUForUI) { - - this.accumulators = accumulators; - this.contextMapper = contextMapper; - this.schemaLocation = schemaLocation; - this.specialDatasourceTypes = - Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI)); - - this.indent = indent; - } - - public String build(final JoinedEntity je) { - - final Set contexts = Sets.newHashSet(); - - final OafEntity entity = toOafEntity(je.getEntity()); - TemplateFactory templateFactory = new TemplateFactory(); - try { - final EntityType type = EntityType.valueOf(je.getEntity().getType()); - final List metadata = metadata(type, entity, contexts); - - // rels has to be processed before the contexts because they enrich the contextMap with - // the - // funding info. 
- final List relations = - je.getLinks().stream() - .filter(t -> !REL_SUBTYPE_DEDUP.equalsIgnoreCase(t.getRelation().getSubRelType())) - .map(link -> mapRelation(link, templateFactory, contexts)) - .collect(Collectors.toCollection(ArrayList::new)); - - final String mainType = ModelSupport.getMainType(type); - metadata.addAll(buildContexts(mainType, contexts)); - metadata.add(XmlSerializationUtils.parseDataInfo(entity.getDataInfo())); - - final String body = - templateFactory.buildBody( - mainType, - metadata, - relations, - listChildren(entity, je, templateFactory), - listExtraInfo(entity)); - - return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); - } catch (final Throwable e) { - throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e); - } - } - - private static OafEntity toOafEntity(TypedRow typedRow) { - return parseOaf(typedRow.getOaf(), typedRow.getType()); - } - - private static OafEntity parseOaf(final String json, final String type) { - try { - switch (EntityType.valueOf(type)) { - case publication: - return OBJECT_MAPPER.readValue(json, Publication.class); - case dataset: - return OBJECT_MAPPER.readValue(json, Dataset.class); - case otherresearchproduct: - return OBJECT_MAPPER.readValue(json, OtherResearchProduct.class); - case software: - return OBJECT_MAPPER.readValue(json, Software.class); - case datasource: - return OBJECT_MAPPER.readValue(json, Datasource.class); - case organization: - return OBJECT_MAPPER.readValue(json, Organization.class); - case project: - return OBJECT_MAPPER.readValue(json, Project.class); - default: - throw new IllegalArgumentException("invalid type: " + type); - } - } catch (IOException e) { - throw new IllegalArgumentException(e); - } - } - - private String printXML(String xml, boolean indent) { - try { - final Document doc = new SAXReader().read(new StringReader(xml)); - OutputFormat format = - indent ? 
OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat(); - format.setExpandEmptyElements(false); - format.setSuppressDeclaration(true); - StringWriter sw = new StringWriter(); - XMLWriter writer = new XMLWriter(sw, format); - writer.write(doc); - return sw.toString(); - } catch (IOException | DocumentException e) { - throw new IllegalArgumentException("Unable to indent XML. Invalid record:\n" + xml, e); - } - } - - private List metadata( - final EntityType type, final OafEntity entity, final Set contexts) { - - final List metadata = Lists.newArrayList(); - - if (entity.getCollectedfrom() != null) { - metadata.addAll( - entity.getCollectedfrom().stream() - .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) - .collect(Collectors.toList())); - } - if (entity.getOriginalId() != null) { - metadata.addAll( - entity.getOriginalId().stream() - .map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) - .collect(Collectors.toList())); - } - if (entity.getPid() != null) { - metadata.addAll( - entity.getPid().stream() - .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) - .collect(Collectors.toList())); - } - - if (ModelSupport.isResult(type)) { - final Result r = (Result) entity; - - if (r.getContext() != null) { - contexts.addAll(r.getContext().stream().map(c -> c.getId()).collect(Collectors.toList())); - /* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */ - if (contexts.contains("dh-ch::subcommunity::2")) { - contexts.add("clarin"); - } - } - - if (r.getTitle() != null) { - metadata.addAll( - r.getTitle().stream() - .map(t -> XmlSerializationUtils.mapStructuredProperty("title", t)) - .collect(Collectors.toList())); - } - if (r.getBestaccessright() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("bestaccessright", r.getBestaccessright())); - } - if (r.getAuthor() != null) { - metadata.addAll( - r.getAuthor().stream() - .map( - a -> { - final StringBuilder sb = - new StringBuilder(" - 
isNotBlank(sp.getQualifier().getClassid()) - && isNotBlank(sp.getValue())) - .forEach( - sp -> { - String pidType = - XmlSerializationUtils.escapeXml( - sp.getQualifier().getClassid()) - .replaceAll("\\W", ""); - String pidValue = XmlSerializationUtils.escapeXml(sp.getValue()); - - // ugly hack: some records - // provide swapped pidtype and - // pidvalue - if (authorPidTypes.contains(pidValue.toLowerCase().trim())) { - sb.append(String.format(" %s=\"%s\"", pidValue, pidType)); - } else { - pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", ""); - if (isNotBlank(pidType)) { - sb.append( - String.format( - " %s=\"%s\"", - pidType, - pidValue.toLowerCase().replaceAll("orcid", ""))); - } - } - }); - } - sb.append( - ">" + XmlSerializationUtils.escapeXml(a.getFullname()) + ""); - return sb.toString(); - }) - .collect(Collectors.toList())); - } - if (r.getContributor() != null) { - metadata.addAll( - r.getContributor().stream() - .map(c -> XmlSerializationUtils.asXmlElement("contributor", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getCountry() != null) { - metadata.addAll( - r.getCountry().stream() - .map(c -> XmlSerializationUtils.mapQualifier("country", c)) - .collect(Collectors.toList())); - } - if (r.getCoverage() != null) { - metadata.addAll( - r.getCoverage().stream() - .map(c -> XmlSerializationUtils.asXmlElement("coverage", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getDateofacceptance() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "dateofacceptance", r.getDateofacceptance().getValue())); - } - if (r.getDescription() != null) { - metadata.addAll( - r.getDescription().stream() - .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getEmbargoenddate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("embargoenddate", r.getEmbargoenddate().getValue())); - } - if (r.getSubject() != null) { - metadata.addAll( - 
r.getSubject().stream() - .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) - .collect(Collectors.toList())); - } - if (r.getLanguage() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("language", r.getLanguage())); - } - if (r.getRelevantdate() != null) { - metadata.addAll( - r.getRelevantdate().stream() - .map(s -> XmlSerializationUtils.mapStructuredProperty("relevantdate", s)) - .collect(Collectors.toList())); - } - if (r.getPublisher() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("publisher", r.getPublisher().getValue())); - } - if (r.getSource() != null) { - metadata.addAll( - r.getSource().stream() - .map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getFormat() != null) { - metadata.addAll( - r.getFormat().stream() - .map(c -> XmlSerializationUtils.asXmlElement("format", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getResulttype() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("resulttype", r.getResulttype())); - } - if (r.getResourcetype() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("resourcetype", r.getResourcetype())); - } - } - - switch (type) { - case publication: - final Publication pub = (Publication) entity; - - if (pub.getJournal() != null) { - final Journal j = pub.getJournal(); - metadata.add(XmlSerializationUtils.mapJournal(j)); - } - - break; - case dataset: - final Dataset d = (Dataset) entity; - if (d.getDevice() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("device", d.getDevice().getValue())); - } - if (d.getLastmetadataupdate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "lastmetadataupdate", d.getLastmetadataupdate().getValue())); - } - if (d.getMetadataversionnumber() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "metadataversionnumber", d.getMetadataversionnumber().getValue())); - } - if (d.getSize() != null) { - 
metadata.add(XmlSerializationUtils.asXmlElement("size", d.getSize().getValue())); - } - if (d.getStoragedate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("storagedate", d.getStoragedate().getValue())); - } - if (d.getVersion() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("version", d.getVersion().getValue())); - } - // TODO d.getGeolocation() - - break; - case otherresearchproduct: - final OtherResearchProduct orp = (OtherResearchProduct) entity; - - if (orp.getContactperson() != null) { - metadata.addAll( - orp.getContactperson().stream() - .map(c -> XmlSerializationUtils.asXmlElement("contactperson", c.getValue())) - .collect(Collectors.toList())); - } - - if (orp.getContactgroup() != null) { - metadata.addAll( - orp.getContactgroup().stream() - .map(c -> XmlSerializationUtils.asXmlElement("contactgroup", c.getValue())) - .collect(Collectors.toList())); - } - if (orp.getTool() != null) { - metadata.addAll( - orp.getTool().stream() - .map(c -> XmlSerializationUtils.asXmlElement("tool", c.getValue())) - .collect(Collectors.toList())); - } - break; - case software: - final Software s = (Software) entity; - - if (s.getDocumentationUrl() != null) { - metadata.addAll( - s.getDocumentationUrl().stream() - .map(c -> XmlSerializationUtils.asXmlElement("documentationUrl", c.getValue())) - .collect(Collectors.toList())); - } - if (s.getLicense() != null) { - metadata.addAll( - s.getLicense().stream() - .map(l -> XmlSerializationUtils.mapStructuredProperty("license", l)) - .collect(Collectors.toList())); - } - if (s.getCodeRepositoryUrl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "codeRepositoryUrl", s.getCodeRepositoryUrl().getValue())); - } - if (s.getProgrammingLanguage() != null) { - metadata.add( - XmlSerializationUtils.mapQualifier( - "programmingLanguage", s.getProgrammingLanguage())); - } - break; - case datasource: - final Datasource ds = (Datasource) entity; - - if (ds.getDatasourcetype() != null) { - 
mapDatasourceType(metadata, ds.getDatasourcetype()); - } - if (ds.getOpenairecompatibility() != null) { - metadata.add( - XmlSerializationUtils.mapQualifier( - "openairecompatibility", ds.getOpenairecompatibility())); - } - if (ds.getOfficialname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("officialname", ds.getOfficialname().getValue())); - } - if (ds.getEnglishname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("englishname", ds.getEnglishname().getValue())); - } - if (ds.getWebsiteurl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("websiteurl", ds.getWebsiteurl().getValue())); - } - if (ds.getLogourl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("logourl", ds.getLogourl().getValue())); - } - if (ds.getContactemail() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("contactemail", ds.getContactemail().getValue())); - } - if (ds.getNamespaceprefix() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "namespaceprefix", ds.getNamespaceprefix().getValue())); - } - if (ds.getLatitude() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("latitude", ds.getLatitude().getValue())); - } - if (ds.getLongitude() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("longitude", ds.getLongitude().getValue())); - } - if (ds.getDateofvalidation() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "dateofvalidation", ds.getDateofvalidation().getValue())); - } - if (ds.getDescription() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("description", ds.getDescription().getValue())); - } - if (ds.getOdnumberofitems() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "odnumberofitems", ds.getOdnumberofitems().getValue())); - } - if (ds.getOdnumberofitemsdate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue())); - } - if 
(ds.getOdpolicies() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("odpolicies", ds.getOdpolicies().getValue())); - } - if (ds.getOdlanguages() != null) { - metadata.addAll( - ds.getOdlanguages().stream() - .map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue())) - .collect(Collectors.toList())); - } - if (ds.getOdcontenttypes() != null) { - metadata.addAll( - ds.getOdcontenttypes().stream() - .map(c -> XmlSerializationUtils.asXmlElement("odcontenttypes", c.getValue())) - .collect(Collectors.toList())); - } - if (ds.getAccessinfopackage() != null) { - metadata.addAll( - ds.getAccessinfopackage().stream() - .map(c -> XmlSerializationUtils.asXmlElement("accessinfopackage", c.getValue())) - .collect(Collectors.toList())); - } - if (ds.getReleaseenddate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "releasestartdate", ds.getReleaseenddate().getValue())); - } - if (ds.getReleaseenddate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "releaseenddate", ds.getReleaseenddate().getValue())); - } - if (ds.getMissionstatementurl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "missionstatementurl", ds.getMissionstatementurl().getValue())); - } - if (ds.getDataprovider() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "dataprovider", ds.getDataprovider().getValue().toString())); - } - if (ds.getServiceprovider() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "serviceprovider", ds.getServiceprovider().getValue().toString())); - } - if (ds.getDatabaseaccesstype() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "databaseaccesstype", ds.getDatabaseaccesstype().getValue())); - } - if (ds.getDatauploadtype() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "datauploadtype", ds.getDatauploadtype().getValue())); - } - if (ds.getDatabaseaccessrestriction() != null) { - metadata.add( - 
XmlSerializationUtils.asXmlElement( - "databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue())); - } - if (ds.getDatauploadrestriction() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "datauploadrestriction", ds.getDatauploadrestriction().getValue())); - } - if (ds.getVersioning() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "versioning", ds.getVersioning().getValue().toString())); - } - if (ds.getCitationguidelineurl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "citationguidelineurl", ds.getCitationguidelineurl().getValue())); - } - if (ds.getQualitymanagementkind() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "qualitymanagementkind", ds.getQualitymanagementkind().getValue())); - } - if (ds.getPidsystems() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("pidsystems", ds.getPidsystems().getValue())); - } - if (ds.getCertificates() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("certificates", ds.getCertificates().getValue())); - } - if (ds.getPolicies() != null) { - metadata.addAll( - ds.getPolicies().stream() - .map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv)) - .collect(Collectors.toList())); - } - if (ds.getJournal() != null) { - metadata.add(XmlSerializationUtils.mapJournal(ds.getJournal())); - } - if (ds.getSubjects() != null) { - metadata.addAll( - ds.getSubjects().stream() - .map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp)) - .collect(Collectors.toList())); - } - - break; - case organization: - final Organization o = (Organization) entity; - - if (o.getLegalshortname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "legalshortname", o.getLegalshortname().getValue())); - } - if (o.getLegalname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("legalname", o.getLegalname().getValue())); - } - if (o.getAlternativeNames() != null) { - metadata.addAll( - 
o.getAlternativeNames().stream() - .map(c -> XmlSerializationUtils.asXmlElement("alternativeNames", c.getValue())) - .collect(Collectors.toList())); - } - if (o.getWebsiteurl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("websiteurl", o.getWebsiteurl().getValue())); - } - if (o.getLogourl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", o.getLogourl().getValue())); - } - - if (o.getEclegalbody() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("eclegalbody", o.getEclegalbody().getValue())); - } - if (o.getEclegalperson() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("eclegalperson", o.getEclegalperson().getValue())); - } - if (o.getEcnonprofit() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("ecnonprofit", o.getEcnonprofit().getValue())); - } - if (o.getEcresearchorganization() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "ecresearchorganization", o.getEcresearchorganization().getValue())); - } - if (o.getEchighereducation() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "echighereducation", o.getEchighereducation().getValue())); - } - if (o.getEcinternationalorganization() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "ecinternationalorganizationeurinterests", - o.getEcinternationalorganization().getValue())); - } - if (o.getEcinternationalorganization() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "ecinternationalorganization", o.getEcinternationalorganization().getValue())); - } - if (o.getEcenterprise() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("ecenterprise", o.getEcenterprise().getValue())); - } - if (o.getEcsmevalidated() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "ecsmevalidated", o.getEcsmevalidated().getValue())); - } - if (o.getEcnutscode() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("ecnutscode", 
o.getEcnutscode().getValue())); - } - if (o.getCountry() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("country", o.getCountry())); - } - - break; - case project: - final Project p = (Project) entity; - - if (p.getWebsiteurl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("websiteurl", p.getWebsiteurl().getValue())); - } - if (p.getCode() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("code", p.getCode().getValue())); - } - if (p.getAcronym() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("acronym", p.getAcronym().getValue())); - } - if (p.getTitle() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("title", p.getTitle().getValue())); - } - if (p.getStartdate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("startdate", p.getStartdate().getValue())); - } - if (p.getEnddate() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("enddate", p.getEnddate().getValue())); - } - if (p.getCallidentifier() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "callidentifier", p.getCallidentifier().getValue())); - } - if (p.getKeywords() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("keywords", p.getKeywords().getValue())); - } - if (p.getDuration() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("duration", p.getDuration().getValue())); - } - if (p.getEcarticle29_3() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue())); - } - if (p.getSubjects() != null) { - metadata.addAll( - p.getSubjects().stream() - .map(sp -> XmlSerializationUtils.mapStructuredProperty("subject", sp)) - .collect(Collectors.toList())); - } - if (p.getContracttype() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("contracttype", p.getContracttype())); - } - if (p.getEcsc39() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecsc39", p.getEcsc39().getValue())); - } - if 
(p.getContactfullname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "contactfullname", p.getContactfullname().getValue())); - } - if (p.getContactfax() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("contactfax", p.getContactfax().getValue())); - } - if (p.getContactphone() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("contactphone", p.getContactphone().getValue())); - } - if (p.getContactemail() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("contactemail", p.getContactemail().getValue())); - } - if (p.getSummary() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("summary", p.getSummary().getValue())); - } - if (p.getCurrency() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("currency", p.getCurrency().getValue())); - } - if (p.getTotalcost() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("totalcost", p.getTotalcost().toString())); - } - if (p.getFundedamount() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("fundedamount", p.getFundedamount().toString())); - } - if (p.getFundingtree() != null) { - metadata.addAll( - p.getFundingtree().stream().map(ft -> ft.getValue()).collect(Collectors.toList())); - } - - break; - default: - throw new IllegalArgumentException("invalid entity type: " + type); - } - - return metadata; - } - - private void mapDatasourceType(List metadata, final Qualifier dsType) { - metadata.add(XmlSerializationUtils.mapQualifier("datasourcetype", dsType)); - - if (specialDatasourceTypes.contains(dsType.getClassid())) { - dsType.setClassid("other"); - dsType.setClassname("other"); - } - metadata.add(XmlSerializationUtils.mapQualifier("datasourcetypeui", dsType)); - } - - private String mapRelation(Tuple2 link, TemplateFactory templateFactory, Set contexts) { - final Relation rel = link.getRelation(); - final RelatedEntity re = link.getRelatedEntity(); - final String targetType = link.getRelatedEntity().getType(); - - 
final List metadata = Lists.newArrayList(); - switch (EntityType.valueOf(targetType)) { - case publication: - case dataset: - case otherresearchproduct: - case software: - if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) { - metadata.add(XmlSerializationUtils.mapStructuredProperty("title", re.getTitle())); - } - if (isNotBlank(re.getDateofacceptance())) { - metadata.add( - XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance())); - } - if (isNotBlank(re.getPublisher())) { - metadata.add(XmlSerializationUtils.asXmlElement("publisher", re.getPublisher())); - } - if (isNotBlank(re.getCodeRepositoryUrl())) { - metadata.add( - XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); - } - if (re.getResulttype() != null & re.getResulttype().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype())); - } - if (re.getCollectedfrom() != null) { - metadata.addAll( - re.getCollectedfrom().stream() - .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) - .collect(Collectors.toList())); - } - if (re.getPid() != null) { - metadata.addAll( - re.getPid().stream() - .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) - .collect(Collectors.toList())); - } - break; - case datasource: - if (isNotBlank(re.getOfficialname())) { - metadata.add(XmlSerializationUtils.asXmlElement("officialname", re.getOfficialname())); - } - if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) { - mapDatasourceType(metadata, re.getDatasourcetype()); - } - if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) { - metadata.add( - XmlSerializationUtils.mapQualifier( - "openairecompatibility", re.getOpenairecompatibility())); - } - break; - case organization: - if (isNotBlank(re.getLegalname())) { - metadata.add(XmlSerializationUtils.asXmlElement("legalname", re.getLegalname())); - } - if (isNotBlank(re.getLegalshortname())) 
{ - metadata.add( - XmlSerializationUtils.asXmlElement("legalshortname", re.getLegalshortname())); - } - if (re.getCountry() != null & !re.getCountry().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry())); - } - break; - case project: - if (isNotBlank(re.getProjectTitle())) { - metadata.add(XmlSerializationUtils.asXmlElement("title", re.getProjectTitle())); - } - if (isNotBlank(re.getCode())) { - metadata.add(XmlSerializationUtils.asXmlElement("code", re.getCode())); - } - if (isNotBlank(re.getAcronym())) { - metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym())); - } - if (re.getContracttype() != null & !re.getContracttype().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("contracttype", re.getContracttype())); - } - if (re.getFundingtree() != null & contexts != null) { - metadata.addAll( - re.getFundingtree().stream() - .peek(ft -> fillContextMap(ft, contexts)) - .map(ft -> getRelFundingTree(ft)) - .collect(Collectors.toList())); - } - break; - default: - throw new IllegalArgumentException("invalid target type: " + targetType); - } - final DataInfo info = rel.getDataInfo(); - final String scheme = ModelSupport.getScheme(re.getType(), targetType); - - if (StringUtils.isBlank(scheme)) { - throw new IllegalArgumentException( - String.format("missing scheme for: <%s - %s>", re.getType(), targetType)); - } - - final String accumulatorName = - getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass()); - if (accumulators.containsKey(accumulatorName)) { - accumulators.get(accumulatorName).add(1); - } - - return templateFactory.getRel( - targetType, rel.getTarget(), Sets.newHashSet(metadata), rel.getRelClass(), scheme, info); - } - - private List listChildren( - final OafEntity entity, JoinedEntity je, TemplateFactory templateFactory) { - - final List children = Lists.newArrayList(); - EntityType entityType = EntityType.valueOf(je.getEntity().getType()); - - children.addAll( - 
je.getLinks().stream() - .filter(link -> REL_SUBTYPE_DEDUP.equalsIgnoreCase(link.getRelation().getSubRelType())) - .map(link -> mapRelation(link, templateFactory, null)) - .collect(Collectors.toCollection(ArrayList::new))); - - if (MainEntityType.result.toString().equals(ModelSupport.getMainType(entityType))) { - final List instances = ((Result) entity).getInstance(); - if (instances != null) { - for (final Instance instance : ((Result) entity).getInstance()) { - - final List fields = Lists.newArrayList(); - - if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) { - fields.add( - XmlSerializationUtils.mapQualifier("accessright", instance.getAccessright())); - } - if (instance.getCollectedfrom() != null) { - fields.add( - XmlSerializationUtils.mapKeyValue("collectedfrom", instance.getCollectedfrom())); - } - if (instance.getHostedby() != null) { - fields.add(XmlSerializationUtils.mapKeyValue("hostedby", instance.getHostedby())); - } - if (instance.getDateofacceptance() != null - && isNotBlank(instance.getDateofacceptance().getValue())) { - fields.add( - XmlSerializationUtils.asXmlElement( - "dateofacceptance", instance.getDateofacceptance().getValue())); - } - if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) { - fields.add( - XmlSerializationUtils.mapQualifier("instancetype", instance.getInstancetype())); - } - if (isNotBlank(instance.getDistributionlocation())) { - fields.add( - XmlSerializationUtils.asXmlElement( - "distributionlocation", instance.getDistributionlocation())); - } - if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) { - fields.add( - XmlSerializationUtils.asXmlElement("refereed", instance.getRefereed().getValue())); - } - if (instance.getProcessingchargeamount() != null - && isNotBlank(instance.getProcessingchargeamount().getValue())) { - fields.add( - XmlSerializationUtils.asXmlElement( - "processingchargeamount", 
instance.getProcessingchargeamount().getValue())); - } - if (instance.getProcessingchargecurrency() != null - && isNotBlank(instance.getProcessingchargecurrency().getValue())) { - fields.add( - XmlSerializationUtils.asXmlElement( - "processingchargecurrency", instance.getProcessingchargecurrency().getValue())); - } - - children.add( - templateFactory.getInstance( - instance.getHostedby().getKey(), fields, instance.getUrl())); - } - } - final List ext = ((Result) entity).getExternalReference(); - if (ext != null) { - for (final ExternalReference er : ((Result) entity).getExternalReference()) { - - final List fields = Lists.newArrayList(); - - if (isNotBlank(er.getSitename())) { - fields.add(XmlSerializationUtils.asXmlElement("sitename", er.getSitename())); - } - if (isNotBlank(er.getLabel())) { - fields.add(XmlSerializationUtils.asXmlElement("label", er.getLabel())); - } - if (isNotBlank(er.getUrl())) { - fields.add(XmlSerializationUtils.asXmlElement("url", er.getUrl())); - } - if (isNotBlank(er.getDescription())) { - fields.add(XmlSerializationUtils.asXmlElement("description", er.getDescription())); - } - if (isNotBlank(er.getUrl())) { - fields.add(XmlSerializationUtils.mapQualifier("qualifier", er.getQualifier())); - } - if (isNotBlank(er.getRefidentifier())) { - fields.add(XmlSerializationUtils.asXmlElement("refidentifier", er.getRefidentifier())); - } - if (isNotBlank(er.getQuery())) { - fields.add(XmlSerializationUtils.asXmlElement("query", er.getQuery())); - } - - children.add(templateFactory.getChild("externalreference", null, fields)); - } - } - } - - return children; - } - - private List listExtraInfo(OafEntity entity) { - final List extraInfo = entity.getExtraInfo(); - return extraInfo != null - ? 
extraInfo.stream() - .map(e -> XmlSerializationUtils.mapExtraInfo(e)) - .collect(Collectors.toList()) - : Lists.newArrayList(); - } - - private List buildContexts(final String type, final Set contexts) { - final List res = Lists.newArrayList(); - - if ((contextMapper != null) - && !contextMapper.isEmpty() - && MainEntityType.result.toString().equals(type)) { - - XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot"); - - for (final String context : contexts) { - - String id = ""; - for (final String token : Splitter.on("::").split(context)) { - id += token; - - final ContextDef def = contextMapper.get(id); - - if (def == null) { - continue; - // throw new IllegalStateException(String.format("cannot find context for id - // '%s'", - // id)); - } - - if (def.getName().equals("context")) { - final String xpath = "//context/@id='" + def.getId() + "'"; - if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) { - document = addContextDef(document.gotoRoot(), def); - } - } - - if (def.getName().equals("category")) { - final String rootId = substringBefore(def.getId(), "::"); - document = - addContextDef( - document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), - def); - } - - if (def.getName().equals("concept")) { - document = addContextDef(document, def).gotoParent(); - } - id += "::"; - } - } - final Transformer transformer = getTransformer(); - for (final org.w3c.dom.Element x : document.gotoRoot().getChildElement()) { - try { - res.add(asStringElement(x, transformer)); - } catch (final TransformerException e) { - throw new RuntimeException(e); - } - } - } - - return res; - } - - private Transformer getTransformer() { - try { - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - return transformer; - } catch (TransformerConfigurationException e) { - throw new IllegalStateException("unable to create 
javax.xml.transform.Transformer", e); - } - } - - private XMLTag addContextDef(final XMLTag tag, final ContextDef def) { - tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel()); - if ((def.getType() != null) && !def.getType().isEmpty()) { - tag.addAttribute("type", def.getType()); - } - return tag; - } - - private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) - throws TransformerException { - final StringWriter buffer = new StringWriter(); - transformer.transform(new DOMSource(element), new StreamResult(buffer)); - return buffer.toString(); - } - - private void fillContextMap(final String xmlTree, final Set contexts) { - - Document fundingPath; - try { - fundingPath = new SAXReader().read(new StringReader(xmlTree)); - } catch (final DocumentException e) { - throw new RuntimeException(e); - } - try { - final Node funder = fundingPath.selectSingleNode("//funder"); - - if (funder != null) { - - final String funderShortName = funder.valueOf("./shortname"); - contexts.add(funderShortName); - - contextMapper.put( - funderShortName, - new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding")); - final Node level0 = fundingPath.selectSingleNode("//funding_level_0"); - if (level0 != null) { - final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name")); - contextMapper.put( - level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", "")); - final Node level1 = fundingPath.selectSingleNode("//funding_level_1"); - if (level1 == null) { - contexts.add(level0Id); - } else { - final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name")); - contextMapper.put( - level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", "")); - final Node level2 = fundingPath.selectSingleNode("//funding_level_2"); - if (level2 == null) { - contexts.add(level1Id); - } else { - final String level2Id = 
Joiner.on("::").join(level1Id, level2.valueOf("./name")); - contextMapper.put( - level2Id, - new ContextDef(level2Id, level2.valueOf("./description"), "concept", "")); - contexts.add(level2Id); - } - } - } - } - } catch (final NullPointerException e) { - throw new IllegalArgumentException("malformed funding path: " + xmlTree, e); - } - } - - @SuppressWarnings("unchecked") - protected static String getRelFundingTree(final String xmlTree) { - String funding = ""; - try { - final Document ftree = new SAXReader().read(new StringReader(xmlTree)); - funding = ""; - - funding += getFunderElement(ftree); - - for (final Object o : - Lists.reverse( - ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) { - final Element e = (Element) o; - final String _id = e.valueOf("./id"); - funding += - "<" - + e.getName() - + " name=\"" - + XmlSerializationUtils.escapeXml(e.valueOf("./name")) - + "\">" - + XmlSerializationUtils.escapeXml(_id) - + ""; - } - } catch (final DocumentException e) { - throw new IllegalArgumentException( - "unable to parse funding tree: " + xmlTree + "\n" + e.getMessage()); - } finally { - funding += ""; - } - return funding; - } - - private static String getFunderElement(final Document ftree) { - final String funderId = ftree.valueOf("//fundingtree/funder/id"); - final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname"); - final String funderName = ftree.valueOf("//fundingtree/funder/name"); - final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction"); - - return ""; - } + public static final String REL_SUBTYPE_DEDUP = "dedup"; + private Map accumulators; + + private Set specialDatasourceTypes; + + private ContextMapper contextMapper; + + private String schemaLocation; + + private boolean indent = false; + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public XmlRecordFactory( + final ContextMapper contextMapper, + final boolean indent, + final String 
schemaLocation, + final String otherDatasourceTypesUForUI) { + + this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI); + } + + public XmlRecordFactory( + final Map accumulators, + final ContextMapper contextMapper, + final boolean indent, + final String schemaLocation, + final String otherDatasourceTypesUForUI) { + + this.accumulators = accumulators; + this.contextMapper = contextMapper; + this.schemaLocation = schemaLocation; + this.specialDatasourceTypes = Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI)); + + this.indent = indent; + } + + public String build(final JoinedEntity je) { + + final Set contexts = Sets.newHashSet(); + + final OafEntity entity = toOafEntity(je.getEntity()); + TemplateFactory templateFactory = new TemplateFactory(); + try { + final EntityType type = EntityType.valueOf(je.getEntity().getType()); + final List metadata = metadata(type, entity, contexts); + + // rels has to be processed before the contexts because they enrich the contextMap with + // the + // funding info. 
+ final List relations = je + .getLinks() + .stream() + .filter(t -> !REL_SUBTYPE_DEDUP.equalsIgnoreCase(t.getRelation().getSubRelType())) + .map(link -> mapRelation(link, templateFactory, contexts)) + .collect(Collectors.toCollection(ArrayList::new)); + + final String mainType = ModelSupport.getMainType(type); + metadata.addAll(buildContexts(mainType, contexts)); + metadata.add(XmlSerializationUtils.parseDataInfo(entity.getDataInfo())); + + final String body = templateFactory + .buildBody( + mainType, + metadata, + relations, + listChildren(entity, je, templateFactory), + listExtraInfo(entity)); + + return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); + } catch (final Throwable e) { + throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e); + } + } + + private static OafEntity toOafEntity(TypedRow typedRow) { + return parseOaf(typedRow.getOaf(), typedRow.getType()); + } + + private static OafEntity parseOaf(final String json, final String type) { + try { + switch (EntityType.valueOf(type)) { + case publication: + return OBJECT_MAPPER.readValue(json, Publication.class); + case dataset: + return OBJECT_MAPPER.readValue(json, Dataset.class); + case otherresearchproduct: + return OBJECT_MAPPER.readValue(json, OtherResearchProduct.class); + case software: + return OBJECT_MAPPER.readValue(json, Software.class); + case datasource: + return OBJECT_MAPPER.readValue(json, Datasource.class); + case organization: + return OBJECT_MAPPER.readValue(json, Organization.class); + case project: + return OBJECT_MAPPER.readValue(json, Project.class); + default: + throw new IllegalArgumentException("invalid type: " + type); + } + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + + private String printXML(String xml, boolean indent) { + try { + final Document doc = new SAXReader().read(new StringReader(xml)); + OutputFormat format = indent ? 
OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat(); + format.setExpandEmptyElements(false); + format.setSuppressDeclaration(true); + StringWriter sw = new StringWriter(); + XMLWriter writer = new XMLWriter(sw, format); + writer.write(doc); + return sw.toString(); + } catch (IOException | DocumentException e) { + throw new IllegalArgumentException("Unable to indent XML. Invalid record:\n" + xml, e); + } + } + + private List metadata( + final EntityType type, final OafEntity entity, final Set contexts) { + + final List metadata = Lists.newArrayList(); + + if (entity.getCollectedfrom() != null) { + metadata + .addAll( + entity + .getCollectedfrom() + .stream() + .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) + .collect(Collectors.toList())); + } + if (entity.getOriginalId() != null) { + metadata + .addAll( + entity + .getOriginalId() + .stream() + .map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) + .collect(Collectors.toList())); + } + if (entity.getPid() != null) { + metadata + .addAll( + entity + .getPid() + .stream() + .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); + } + + if (ModelSupport.isResult(type)) { + final Result r = (Result) entity; + + if (r.getContext() != null) { + contexts.addAll(r.getContext().stream().map(c -> c.getId()).collect(Collectors.toList())); + /* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */ + if (contexts.contains("dh-ch::subcommunity::2")) { + contexts.add("clarin"); + } + } + + if (r.getTitle() != null) { + metadata + .addAll( + r + .getTitle() + .stream() + .map(t -> XmlSerializationUtils.mapStructuredProperty("title", t)) + .collect(Collectors.toList())); + } + if (r.getBestaccessright() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("bestaccessright", r.getBestaccessright())); + } + if (r.getAuthor() != null) { + metadata + .addAll( + r + .getAuthor() + .stream() + .map( + a -> { + final StringBuilder sb 
= new StringBuilder(" isNotBlank(sp.getQualifier().getClassid()) + && isNotBlank(sp.getValue())) + .forEach( + sp -> { + String pidType = XmlSerializationUtils + .escapeXml( + sp.getQualifier().getClassid()) + .replaceAll("\\W", ""); + String pidValue = XmlSerializationUtils.escapeXml(sp.getValue()); + + // ugly hack: some records + // provide swapped pidtype and + // pidvalue + if (authorPidTypes.contains(pidValue.toLowerCase().trim())) { + sb.append(String.format(" %s=\"%s\"", pidValue, pidType)); + } else { + pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", ""); + if (isNotBlank(pidType)) { + sb + .append( + String + .format( + " %s=\"%s\"", + pidType, + pidValue + .toLowerCase() + .replaceAll("orcid", ""))); + } + } + }); + } + sb + .append( + ">" + XmlSerializationUtils.escapeXml(a.getFullname()) + ""); + return sb.toString(); + }) + .collect(Collectors.toList())); + } + if (r.getContributor() != null) { + metadata + .addAll( + r + .getContributor() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("contributor", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getCountry() != null) { + metadata + .addAll( + r + .getCountry() + .stream() + .map(c -> XmlSerializationUtils.mapQualifier("country", c)) + .collect(Collectors.toList())); + } + if (r.getCoverage() != null) { + metadata + .addAll( + r + .getCoverage() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("coverage", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getDateofacceptance() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "dateofacceptance", r.getDateofacceptance().getValue())); + } + if (r.getDescription() != null) { + metadata + .addAll( + r + .getDescription() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getEmbargoenddate() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("embargoenddate", 
r.getEmbargoenddate().getValue())); + } + if (r.getSubject() != null) { + metadata + .addAll( + r + .getSubject() + .stream() + .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) + .collect(Collectors.toList())); + } + if (r.getLanguage() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("language", r.getLanguage())); + } + if (r.getRelevantdate() != null) { + metadata + .addAll( + r + .getRelevantdate() + .stream() + .map(s -> XmlSerializationUtils.mapStructuredProperty("relevantdate", s)) + .collect(Collectors.toList())); + } + if (r.getPublisher() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("publisher", r.getPublisher().getValue())); + } + if (r.getSource() != null) { + metadata + .addAll( + r + .getSource() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getFormat() != null) { + metadata + .addAll( + r + .getFormat() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("format", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getResulttype() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("resulttype", r.getResulttype())); + } + if (r.getResourcetype() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("resourcetype", r.getResourcetype())); + } + } + + switch (type) { + case publication: + final Publication pub = (Publication) entity; + + if (pub.getJournal() != null) { + final Journal j = pub.getJournal(); + metadata.add(XmlSerializationUtils.mapJournal(j)); + } + + break; + case dataset: + final Dataset d = (Dataset) entity; + if (d.getDevice() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("device", d.getDevice().getValue())); + } + if (d.getLastmetadataupdate() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "lastmetadataupdate", d.getLastmetadataupdate().getValue())); + } + if (d.getMetadataversionnumber() != null) { + metadata + .add( + 
XmlSerializationUtils + .asXmlElement( + "metadataversionnumber", d.getMetadataversionnumber().getValue())); + } + if (d.getSize() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("size", d.getSize().getValue())); + } + if (d.getStoragedate() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("storagedate", d.getStoragedate().getValue())); + } + if (d.getVersion() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("version", d.getVersion().getValue())); + } + // TODO d.getGeolocation() + + break; + case otherresearchproduct: + final OtherResearchProduct orp = (OtherResearchProduct) entity; + + if (orp.getContactperson() != null) { + metadata + .addAll( + orp + .getContactperson() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("contactperson", c.getValue())) + .collect(Collectors.toList())); + } + + if (orp.getContactgroup() != null) { + metadata + .addAll( + orp + .getContactgroup() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("contactgroup", c.getValue())) + .collect(Collectors.toList())); + } + if (orp.getTool() != null) { + metadata + .addAll( + orp + .getTool() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("tool", c.getValue())) + .collect(Collectors.toList())); + } + break; + case software: + final Software s = (Software) entity; + + if (s.getDocumentationUrl() != null) { + metadata + .addAll( + s + .getDocumentationUrl() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("documentationUrl", c.getValue())) + .collect(Collectors.toList())); + } + if (s.getLicense() != null) { + metadata + .addAll( + s + .getLicense() + .stream() + .map(l -> XmlSerializationUtils.mapStructuredProperty("license", l)) + .collect(Collectors.toList())); + } + if (s.getCodeRepositoryUrl() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "codeRepositoryUrl", s.getCodeRepositoryUrl().getValue())); + } + if (s.getProgrammingLanguage() != null) { + metadata + .add( + 
XmlSerializationUtils + .mapQualifier( + "programmingLanguage", s.getProgrammingLanguage())); + } + break; + case datasource: + final Datasource ds = (Datasource) entity; + + if (ds.getDatasourcetype() != null) { + mapDatasourceType(metadata, ds.getDatasourcetype()); + } + if (ds.getOpenairecompatibility() != null) { + metadata + .add( + XmlSerializationUtils + .mapQualifier( + "openairecompatibility", ds.getOpenairecompatibility())); + } + if (ds.getOfficialname() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("officialname", ds.getOfficialname().getValue())); + } + if (ds.getEnglishname() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("englishname", ds.getEnglishname().getValue())); + } + if (ds.getWebsiteurl() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("websiteurl", ds.getWebsiteurl().getValue())); + } + if (ds.getLogourl() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("logourl", ds.getLogourl().getValue())); + } + if (ds.getContactemail() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("contactemail", ds.getContactemail().getValue())); + } + if (ds.getNamespaceprefix() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "namespaceprefix", ds.getNamespaceprefix().getValue())); + } + if (ds.getLatitude() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("latitude", ds.getLatitude().getValue())); + } + if (ds.getLongitude() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("longitude", ds.getLongitude().getValue())); + } + if (ds.getDateofvalidation() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "dateofvalidation", ds.getDateofvalidation().getValue())); + } + if (ds.getDescription() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("description", ds.getDescription().getValue())); + } + if (ds.getOdnumberofitems() != null) { + metadata + .add( + XmlSerializationUtils + 
.asXmlElement( + "odnumberofitems", ds.getOdnumberofitems().getValue())); + } + if (ds.getOdnumberofitemsdate() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue())); + } + if (ds.getOdpolicies() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("odpolicies", ds.getOdpolicies().getValue())); + } + if (ds.getOdlanguages() != null) { + metadata + .addAll( + ds + .getOdlanguages() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getOdcontenttypes() != null) { + metadata + .addAll( + ds + .getOdcontenttypes() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("odcontenttypes", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getAccessinfopackage() != null) { + metadata + .addAll( + ds + .getAccessinfopackage() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("accessinfopackage", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getReleaseenddate() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "releasestartdate", ds.getReleaseenddate().getValue())); + } + if (ds.getReleaseenddate() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "releaseenddate", ds.getReleaseenddate().getValue())); + } + if (ds.getMissionstatementurl() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "missionstatementurl", ds.getMissionstatementurl().getValue())); + } + if (ds.getDataprovider() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "dataprovider", ds.getDataprovider().getValue().toString())); + } + if (ds.getServiceprovider() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "serviceprovider", ds.getServiceprovider().getValue().toString())); + } + if (ds.getDatabaseaccesstype() != null) { + metadata + .add( + XmlSerializationUtils + 
.asXmlElement( + "databaseaccesstype", ds.getDatabaseaccesstype().getValue())); + } + if (ds.getDatauploadtype() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "datauploadtype", ds.getDatauploadtype().getValue())); + } + if (ds.getDatabaseaccessrestriction() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue())); + } + if (ds.getDatauploadrestriction() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "datauploadrestriction", ds.getDatauploadrestriction().getValue())); + } + if (ds.getVersioning() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "versioning", ds.getVersioning().getValue().toString())); + } + if (ds.getCitationguidelineurl() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "citationguidelineurl", ds.getCitationguidelineurl().getValue())); + } + if (ds.getQualitymanagementkind() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "qualitymanagementkind", ds.getQualitymanagementkind().getValue())); + } + if (ds.getPidsystems() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("pidsystems", ds.getPidsystems().getValue())); + } + if (ds.getCertificates() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("certificates", ds.getCertificates().getValue())); + } + if (ds.getPolicies() != null) { + metadata + .addAll( + ds + .getPolicies() + .stream() + .map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv)) + .collect(Collectors.toList())); + } + if (ds.getJournal() != null) { + metadata.add(XmlSerializationUtils.mapJournal(ds.getJournal())); + } + if (ds.getSubjects() != null) { + metadata + .addAll( + ds + .getSubjects() + .stream() + .map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp)) + .collect(Collectors.toList())); + } + + break; + case organization: + final Organization o = 
(Organization) entity; + + if (o.getLegalshortname() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "legalshortname", o.getLegalshortname().getValue())); + } + if (o.getLegalname() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("legalname", o.getLegalname().getValue())); + } + if (o.getAlternativeNames() != null) { + metadata + .addAll( + o + .getAlternativeNames() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("alternativeNames", c.getValue())) + .collect(Collectors.toList())); + } + if (o.getWebsiteurl() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("websiteurl", o.getWebsiteurl().getValue())); + } + if (o.getLogourl() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", o.getLogourl().getValue())); + } + + if (o.getEclegalbody() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("eclegalbody", o.getEclegalbody().getValue())); + } + if (o.getEclegalperson() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("eclegalperson", o.getEclegalperson().getValue())); + } + if (o.getEcnonprofit() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("ecnonprofit", o.getEcnonprofit().getValue())); + } + if (o.getEcresearchorganization() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "ecresearchorganization", o.getEcresearchorganization().getValue())); + } + if (o.getEchighereducation() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "echighereducation", o.getEchighereducation().getValue())); + } + if (o.getEcinternationalorganization() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "ecinternationalorganizationeurinterests", + o.getEcinternationalorganization().getValue())); + } + if (o.getEcinternationalorganization() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "ecinternationalorganization", 
o.getEcinternationalorganization().getValue())); + } + if (o.getEcenterprise() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("ecenterprise", o.getEcenterprise().getValue())); + } + if (o.getEcsmevalidated() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "ecsmevalidated", o.getEcsmevalidated().getValue())); + } + if (o.getEcnutscode() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("ecnutscode", o.getEcnutscode().getValue())); + } + if (o.getCountry() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("country", o.getCountry())); + } + + break; + case project: + final Project p = (Project) entity; + + if (p.getWebsiteurl() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("websiteurl", p.getWebsiteurl().getValue())); + } + if (p.getCode() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("code", p.getCode().getValue())); + } + if (p.getAcronym() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("acronym", p.getAcronym().getValue())); + } + if (p.getTitle() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("title", p.getTitle().getValue())); + } + if (p.getStartdate() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("startdate", p.getStartdate().getValue())); + } + if (p.getEnddate() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("enddate", p.getEnddate().getValue())); + } + if (p.getCallidentifier() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "callidentifier", p.getCallidentifier().getValue())); + } + if (p.getKeywords() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("keywords", p.getKeywords().getValue())); + } + if (p.getDuration() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("duration", p.getDuration().getValue())); + } + if (p.getEcarticle29_3() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("ecarticle29_3", 
p.getEcarticle29_3().getValue())); + } + if (p.getSubjects() != null) { + metadata + .addAll( + p + .getSubjects() + .stream() + .map(sp -> XmlSerializationUtils.mapStructuredProperty("subject", sp)) + .collect(Collectors.toList())); + } + if (p.getContracttype() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("contracttype", p.getContracttype())); + } + if (p.getEcsc39() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("ecsc39", p.getEcsc39().getValue())); + } + if (p.getContactfullname() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "contactfullname", p.getContactfullname().getValue())); + } + if (p.getContactfax() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("contactfax", p.getContactfax().getValue())); + } + if (p.getContactphone() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("contactphone", p.getContactphone().getValue())); + } + if (p.getContactemail() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("contactemail", p.getContactemail().getValue())); + } + if (p.getSummary() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("summary", p.getSummary().getValue())); + } + if (p.getCurrency() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("currency", p.getCurrency().getValue())); + } + if (p.getTotalcost() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("totalcost", p.getTotalcost().toString())); + } + if (p.getFundedamount() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("fundedamount", p.getFundedamount().toString())); + } + if (p.getFundingtree() != null) { + metadata + .addAll( + p.getFundingtree().stream().map(ft -> ft.getValue()).collect(Collectors.toList())); + } + + break; + default: + throw new IllegalArgumentException("invalid entity type: " + type); + } + + return metadata; + } + + private void mapDatasourceType(List metadata, final Qualifier dsType) { + 
metadata.add(XmlSerializationUtils.mapQualifier("datasourcetype", dsType)); + + if (specialDatasourceTypes.contains(dsType.getClassid())) { + dsType.setClassid("other"); + dsType.setClassname("other"); + } + metadata.add(XmlSerializationUtils.mapQualifier("datasourcetypeui", dsType)); + } + + private String mapRelation(Tuple2 link, TemplateFactory templateFactory, Set contexts) { + final Relation rel = link.getRelation(); + final RelatedEntity re = link.getRelatedEntity(); + final String targetType = link.getRelatedEntity().getType(); + + final List metadata = Lists.newArrayList(); + switch (EntityType.valueOf(targetType)) { + case publication: + case dataset: + case otherresearchproduct: + case software: + if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) { + metadata.add(XmlSerializationUtils.mapStructuredProperty("title", re.getTitle())); + } + if (isNotBlank(re.getDateofacceptance())) { + metadata + .add( + XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance())); + } + if (isNotBlank(re.getPublisher())) { + metadata.add(XmlSerializationUtils.asXmlElement("publisher", re.getPublisher())); + } + if (isNotBlank(re.getCodeRepositoryUrl())) { + metadata + .add( + XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); + } + if (re.getResulttype() != null & re.getResulttype().isBlank()) { + metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype())); + } + if (re.getCollectedfrom() != null) { + metadata + .addAll( + re + .getCollectedfrom() + .stream() + .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) + .collect(Collectors.toList())); + } + if (re.getPid() != null) { + metadata + .addAll( + re + .getPid() + .stream() + .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); + } + break; + case datasource: + if (isNotBlank(re.getOfficialname())) { + metadata.add(XmlSerializationUtils.asXmlElement("officialname", 
re.getOfficialname())); + } + if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) { + mapDatasourceType(metadata, re.getDatasourcetype()); + } + if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) { + metadata + .add( + XmlSerializationUtils + .mapQualifier( + "openairecompatibility", re.getOpenairecompatibility())); + } + break; + case organization: + if (isNotBlank(re.getLegalname())) { + metadata.add(XmlSerializationUtils.asXmlElement("legalname", re.getLegalname())); + } + if (isNotBlank(re.getLegalshortname())) { + metadata + .add( + XmlSerializationUtils.asXmlElement("legalshortname", re.getLegalshortname())); + } + if (re.getCountry() != null & !re.getCountry().isBlank()) { + metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry())); + } + break; + case project: + if (isNotBlank(re.getProjectTitle())) { + metadata.add(XmlSerializationUtils.asXmlElement("title", re.getProjectTitle())); + } + if (isNotBlank(re.getCode())) { + metadata.add(XmlSerializationUtils.asXmlElement("code", re.getCode())); + } + if (isNotBlank(re.getAcronym())) { + metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym())); + } + if (re.getContracttype() != null & !re.getContracttype().isBlank()) { + metadata.add(XmlSerializationUtils.mapQualifier("contracttype", re.getContracttype())); + } + if (re.getFundingtree() != null & contexts != null) { + metadata + .addAll( + re + .getFundingtree() + .stream() + .peek(ft -> fillContextMap(ft, contexts)) + .map(ft -> getRelFundingTree(ft)) + .collect(Collectors.toList())); + } + break; + default: + throw new IllegalArgumentException("invalid target type: " + targetType); + } + final DataInfo info = rel.getDataInfo(); + final String scheme = ModelSupport.getScheme(re.getType(), targetType); + + if (StringUtils.isBlank(scheme)) { + throw new IllegalArgumentException( + String.format("missing scheme for: <%s - %s>", re.getType(), targetType)); + } + + final 
String accumulatorName = getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass()); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(1); + } + + return templateFactory + .getRel( + targetType, rel.getTarget(), Sets.newHashSet(metadata), rel.getRelClass(), scheme, info); + } + + private List listChildren( + final OafEntity entity, JoinedEntity je, TemplateFactory templateFactory) { + + final List children = Lists.newArrayList(); + EntityType entityType = EntityType.valueOf(je.getEntity().getType()); + + children + .addAll( + je + .getLinks() + .stream() + .filter(link -> REL_SUBTYPE_DEDUP.equalsIgnoreCase(link.getRelation().getSubRelType())) + .map(link -> mapRelation(link, templateFactory, null)) + .collect(Collectors.toCollection(ArrayList::new))); + + if (MainEntityType.result.toString().equals(ModelSupport.getMainType(entityType))) { + final List instances = ((Result) entity).getInstance(); + if (instances != null) { + for (final Instance instance : ((Result) entity).getInstance()) { + + final List fields = Lists.newArrayList(); + + if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) { + fields + .add( + XmlSerializationUtils.mapQualifier("accessright", instance.getAccessright())); + } + if (instance.getCollectedfrom() != null) { + fields + .add( + XmlSerializationUtils.mapKeyValue("collectedfrom", instance.getCollectedfrom())); + } + if (instance.getHostedby() != null) { + fields.add(XmlSerializationUtils.mapKeyValue("hostedby", instance.getHostedby())); + } + if (instance.getDateofacceptance() != null + && isNotBlank(instance.getDateofacceptance().getValue())) { + fields + .add( + XmlSerializationUtils + .asXmlElement( + "dateofacceptance", instance.getDateofacceptance().getValue())); + } + if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) { + fields + .add( + XmlSerializationUtils.mapQualifier("instancetype", instance.getInstancetype())); + } + 
if (isNotBlank(instance.getDistributionlocation())) { + fields + .add( + XmlSerializationUtils + .asXmlElement( + "distributionlocation", instance.getDistributionlocation())); + } + if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) { + fields + .add( + XmlSerializationUtils.asXmlElement("refereed", instance.getRefereed().getValue())); + } + if (instance.getProcessingchargeamount() != null + && isNotBlank(instance.getProcessingchargeamount().getValue())) { + fields + .add( + XmlSerializationUtils + .asXmlElement( + "processingchargeamount", instance.getProcessingchargeamount().getValue())); + } + if (instance.getProcessingchargecurrency() != null + && isNotBlank(instance.getProcessingchargecurrency().getValue())) { + fields + .add( + XmlSerializationUtils + .asXmlElement( + "processingchargecurrency", instance.getProcessingchargecurrency().getValue())); + } + + children + .add( + templateFactory + .getInstance( + instance.getHostedby().getKey(), fields, instance.getUrl())); + } + } + final List ext = ((Result) entity).getExternalReference(); + if (ext != null) { + for (final ExternalReference er : ((Result) entity).getExternalReference()) { + + final List fields = Lists.newArrayList(); + + if (isNotBlank(er.getSitename())) { + fields.add(XmlSerializationUtils.asXmlElement("sitename", er.getSitename())); + } + if (isNotBlank(er.getLabel())) { + fields.add(XmlSerializationUtils.asXmlElement("label", er.getLabel())); + } + if (isNotBlank(er.getUrl())) { + fields.add(XmlSerializationUtils.asXmlElement("url", er.getUrl())); + } + if (isNotBlank(er.getDescription())) { + fields.add(XmlSerializationUtils.asXmlElement("description", er.getDescription())); + } + if (isNotBlank(er.getUrl())) { + fields.add(XmlSerializationUtils.mapQualifier("qualifier", er.getQualifier())); + } + if (isNotBlank(er.getRefidentifier())) { + fields.add(XmlSerializationUtils.asXmlElement("refidentifier", er.getRefidentifier())); + } + if 
(isNotBlank(er.getQuery())) { + fields.add(XmlSerializationUtils.asXmlElement("query", er.getQuery())); + } + + children.add(templateFactory.getChild("externalreference", null, fields)); + } + } + } + + return children; + } + + private List listExtraInfo(OafEntity entity) { + final List extraInfo = entity.getExtraInfo(); + return extraInfo != null + ? extraInfo + .stream() + .map(e -> XmlSerializationUtils.mapExtraInfo(e)) + .collect(Collectors.toList()) + : Lists.newArrayList(); + } + + private List buildContexts(final String type, final Set contexts) { + final List res = Lists.newArrayList(); + + if ((contextMapper != null) + && !contextMapper.isEmpty() + && MainEntityType.result.toString().equals(type)) { + + XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot"); + + for (final String context : contexts) { + + String id = ""; + for (final String token : Splitter.on("::").split(context)) { + id += token; + + final ContextDef def = contextMapper.get(id); + + if (def == null) { + continue; + // throw new IllegalStateException(String.format("cannot find context for id + // '%s'", + // id)); + } + + if (def.getName().equals("context")) { + final String xpath = "//context/@id='" + def.getId() + "'"; + if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) { + document = addContextDef(document.gotoRoot(), def); + } + } + + if (def.getName().equals("category")) { + final String rootId = substringBefore(def.getId(), "::"); + document = addContextDef( + document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), + def); + } + + if (def.getName().equals("concept")) { + document = addContextDef(document, def).gotoParent(); + } + id += "::"; + } + } + final Transformer transformer = getTransformer(); + for (final org.w3c.dom.Element x : document.gotoRoot().getChildElement()) { + try { + res.add(asStringElement(x, transformer)); + } catch (final TransformerException e) { + throw new RuntimeException(e); + } + } + } + + return res; + } + 
+ private Transformer getTransformer() { + try { + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + return transformer; + } catch (TransformerConfigurationException e) { + throw new IllegalStateException("unable to create javax.xml.transform.Transformer", e); + } + } + + private XMLTag addContextDef(final XMLTag tag, final ContextDef def) { + tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel()); + if ((def.getType() != null) && !def.getType().isEmpty()) { + tag.addAttribute("type", def.getType()); + } + return tag; + } + + private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) + throws TransformerException { + final StringWriter buffer = new StringWriter(); + transformer.transform(new DOMSource(element), new StreamResult(buffer)); + return buffer.toString(); + } + + private void fillContextMap(final String xmlTree, final Set contexts) { + + Document fundingPath; + try { + fundingPath = new SAXReader().read(new StringReader(xmlTree)); + } catch (final DocumentException e) { + throw new RuntimeException(e); + } + try { + final Node funder = fundingPath.selectSingleNode("//funder"); + + if (funder != null) { + + final String funderShortName = funder.valueOf("./shortname"); + contexts.add(funderShortName); + + contextMapper + .put( + funderShortName, + new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding")); + final Node level0 = fundingPath.selectSingleNode("//funding_level_0"); + if (level0 != null) { + final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name")); + contextMapper + .put( + level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", "")); + final Node level1 = fundingPath.selectSingleNode("//funding_level_1"); + if (level1 == null) { + contexts.add(level0Id); + } else { + final String level1Id = 
Joiner.on("::").join(level0Id, level1.valueOf("./name")); + contextMapper + .put( + level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", "")); + final Node level2 = fundingPath.selectSingleNode("//funding_level_2"); + if (level2 == null) { + contexts.add(level1Id); + } else { + final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name")); + contextMapper + .put( + level2Id, + new ContextDef(level2Id, level2.valueOf("./description"), "concept", "")); + contexts.add(level2Id); + } + } + } + } + } catch (final NullPointerException e) { + throw new IllegalArgumentException("malformed funding path: " + xmlTree, e); + } + } + + @SuppressWarnings("unchecked") + protected static String getRelFundingTree(final String xmlTree) { + String funding = ""; + try { + final Document ftree = new SAXReader().read(new StringReader(xmlTree)); + funding = ""; + + funding += getFunderElement(ftree); + + for (final Object o : Lists + .reverse( + ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) { + final Element e = (Element) o; + final String _id = e.valueOf("./id"); + funding += "<" + + e.getName() + + " name=\"" + + XmlSerializationUtils.escapeXml(e.valueOf("./name")) + + "\">" + + XmlSerializationUtils.escapeXml(_id) + + ""; + } + } catch (final DocumentException e) { + throw new IllegalArgumentException( + "unable to parse funding tree: " + xmlTree + "\n" + e.getMessage()); + } finally { + funding += ""; + } + return funding; + } + + private static String getFunderElement(final Document ftree) { + final String funderId = ftree.valueOf("//fundingtree/funder/id"); + final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname"); + final String funderName = ftree.valueOf("//fundingtree/funder/name"); + final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction"); + + return ""; + } } diff --git 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java index 0b3109bdef..bc3b3107d8 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.provision.utils; import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; @@ -8,149 +9,151 @@ import eu.dnetlib.dhp.schema.oaf.*; public class XmlSerializationUtils { - // XML 1.0 - // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - private static final String xml10pattern = - "[^" + "\u0009\r\n" + "\u0020-\uD7FF" + "\uE000-\uFFFD" + "\ud800\udc00-\udbff\udfff" + "]"; + // XML 1.0 + // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + private static final String xml10pattern = "[^" + "\u0009\r\n" + "\u0020-\uD7FF" + "\uE000-\uFFFD" + + "\ud800\udc00-\udbff\udfff" + "]"; - public static String mapJournal(Journal j) { - final String attrs = - new StringBuilder() - .append(attr("issn", j.getIssnPrinted())) - .append(attr("eissn", j.getIssnOnline())) - .append(attr("lissn", j.getIssnLinking())) - .append(attr("ep", j.getEp())) - .append(attr("iss", j.getIss())) - .append(attr("sp", j.getSp())) - .append(attr("vol", j.getVol())) - .toString() - .trim(); + public static String mapJournal(Journal j) { + final String attrs = new StringBuilder() + .append(attr("issn", j.getIssnPrinted())) + .append(attr("eissn", j.getIssnOnline())) + .append(attr("lissn", j.getIssnLinking())) + .append(attr("ep", j.getEp())) + .append(attr("iss", j.getIss())) + .append(attr("sp", j.getSp())) + .append(attr("vol", j.getVol())) + .toString() + .trim(); - return new StringBuilder() - .append("") - 
.append(escapeXml(j.getName())) - .append("") - .toString(); - } + return new StringBuilder() + .append("") + .append(escapeXml(j.getName())) + .append("") + .toString(); + } - private static String attr(final String name, final String value) { - return isNotBlank(value) ? name + "=\"" + escapeXml(value) + "\" " : ""; - } + private static String attr(final String name, final String value) { + return isNotBlank(value) ? name + "=\"" + escapeXml(value) + "\" " : ""; + } - public static String mapStructuredProperty(String name, StructuredProperty t) { - return asXmlElement( - name, t.getValue(), t.getQualifier(), t.getDataInfo() != null ? t.getDataInfo() : null); - } + public static String mapStructuredProperty(String name, StructuredProperty t) { + return asXmlElement( + name, t.getValue(), t.getQualifier(), t.getDataInfo() != null ? t.getDataInfo() : null); + } - public static String mapQualifier(String name, Qualifier q) { - return asXmlElement(name, "", q, null); - } + public static String mapQualifier(String name, Qualifier q) { + return asXmlElement(name, "", q, null); + } - public static String escapeXml(final String value) { - return value - .replaceAll("&", "&") - .replaceAll("<", "<") - .replaceAll(">", ">") - .replaceAll("\"", """) - .replaceAll("'", "'") - .replaceAll(xml10pattern, ""); - } + public static String escapeXml(final String value) { + return value + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll("\"", """) + .replaceAll("'", "'") + .replaceAll(xml10pattern, ""); + } - public static String parseDataInfo(final DataInfo dataInfo) { - return new StringBuilder() - .append("") - .append(asXmlElement("inferred", dataInfo.getInferred() + "")) - .append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + "")) - .append(asXmlElement("trust", dataInfo.getTrust() + "")) - .append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + "")) - .append(asXmlElement("provenanceaction", 
null, dataInfo.getProvenanceaction(), null)) - .append("") - .toString(); - } + public static String parseDataInfo(final DataInfo dataInfo) { + return new StringBuilder() + .append("") + .append(asXmlElement("inferred", dataInfo.getInferred() + "")) + .append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + "")) + .append(asXmlElement("trust", dataInfo.getTrust() + "")) + .append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + "")) + .append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null)) + .append("") + .toString(); + } - private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) { - return sb.append( - attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) - .append(attr("inferenceprovenance", info.getInferenceprovenance())) - .append( - attr( - "provenanceaction", - info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "")) - .append(attr("trust", info.getTrust())); - } + private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) { + return sb + .append( + attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) + .append(attr("inferenceprovenance", info.getInferenceprovenance())) + .append( + attr( + "provenanceaction", + info.getProvenanceaction() != null ? 
info.getProvenanceaction().getClassid() : "")) + .append(attr("trust", info.getTrust())); + } - public static String mapKeyValue(final String name, final KeyValue kv) { - return new StringBuilder() - .append("<") - .append(name) - .append(" name=\"") - .append(escapeXml(kv.getValue())) - .append("\" id=\"") - .append(escapeXml(removePrefix(kv.getKey()))) - .append("\"/>") - .toString(); - } + public static String mapKeyValue(final String name, final KeyValue kv) { + return new StringBuilder() + .append("<") + .append(name) + .append(" name=\"") + .append(escapeXml(kv.getValue())) + .append("\" id=\"") + .append(escapeXml(removePrefix(kv.getKey()))) + .append("\"/>") + .toString(); + } - public static String mapExtraInfo(final ExtraInfo e) { - return new StringBuilder("") - .append(e.getValue()) - .append("") - .toString(); - } + public static String mapExtraInfo(final ExtraInfo e) { + return new StringBuilder("") + .append(e.getValue()) + .append("") + .toString(); + } - public static String asXmlElement(final String name, final String value) { - return asXmlElement(name, value, null, null); - } + public static String asXmlElement(final String name, final String value) { + return asXmlElement(name, value, null, null); + } - public static String asXmlElement( - final String name, final String value, final Qualifier q, final DataInfo info) { - StringBuilder sb = new StringBuilder(); - sb.append("<"); - sb.append(name); - if (q != null) { - sb.append(getAttributes(q)); - } - if (info != null) { - sb.append(" ") - .append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) - .append(attr("inferenceprovenance", info.getInferenceprovenance())) - .append( - attr( - "provenanceaction", - info.getProvenanceaction() != null - ? 
info.getProvenanceaction().getClassid() - : "")) - .append(attr("trust", info.getTrust())); - } - if (isBlank(value)) { - sb.append("/>"); - return sb.toString(); - } + public static String asXmlElement( + final String name, final String value, final Qualifier q, final DataInfo info) { + StringBuilder sb = new StringBuilder(); + sb.append("<"); + sb.append(name); + if (q != null) { + sb.append(getAttributes(q)); + } + if (info != null) { + sb + .append(" ") + .append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) + .append(attr("inferenceprovenance", info.getInferenceprovenance())) + .append( + attr( + "provenanceaction", + info.getProvenanceaction() != null + ? info.getProvenanceaction().getClassid() + : "")) + .append(attr("trust", info.getTrust())); + } + if (isBlank(value)) { + sb.append("/>"); + return sb.toString(); + } - sb.append(">"); - sb.append(escapeXml(value)); - sb.append(""); + sb.append(">"); + sb.append(escapeXml(value)); + sb.append(""); - return sb.toString(); - } + return sb.toString(); + } - public static String getAttributes(final Qualifier q) { - if (q == null || q.isBlank()) return ""; + public static String getAttributes(final Qualifier q) { + if (q == null || q.isBlank()) + return ""; - return new StringBuilder(" ") - .append(attr("classid", q.getClassid())) - .append(attr("classname", q.getClassname())) - .append(attr("schemeid", q.getSchemeid())) - .append(attr("schemename", q.getSchemename())) - .toString(); - } + return new StringBuilder(" ") + .append(attr("classid", q.getClassid())) + .append(attr("classname", q.getClassname())) + .append(attr("schemeid", q.getSchemeid())) + .append(attr("schemename", q.getSchemename())) + .toString(); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java index 21feb16379..8afe03d6d2 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java @@ -1,39 +1,42 @@ + package eu.dnetlib.dhp.oa.provision; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.junit.jupiter.api.BeforeEach; public class GraphJoinerTest { - private ClassLoader cl = getClass().getClassLoader(); - private Path workingDir; - private Path inputDir; - private Path outputDir; + private ClassLoader cl = getClass().getClassLoader(); + private Path workingDir; + private Path inputDir; + private Path outputDir; - @BeforeEach - public void before() throws IOException { - workingDir = Files.createTempDirectory("promote_action_set"); - inputDir = workingDir.resolve("input"); - outputDir = workingDir.resolve("output"); - } + @BeforeEach + public void before() throws IOException { + workingDir = Files.createTempDirectory("promote_action_set"); + inputDir = workingDir.resolve("input"); + outputDir = workingDir.resolve("output"); + } - private static void copyFiles(Path source, Path target) throws IOException { - Files.list(source) - .forEach( - f -> { - try { - if (Files.isDirectory(f)) { - Path subTarget = Files.createDirectories(target.resolve(f.getFileName())); - copyFiles(f, subTarget); - } else { - Files.copy(f, target.resolve(f.getFileName())); - } - } catch (IOException e) { - e.printStackTrace(); - throw new RuntimeException(e); - } - }); - } + private static void copyFiles(Path source, Path target) throws IOException { + Files + .list(source) + .forEach( + f -> { + try { + if (Files.isDirectory(f)) { + Path subTarget = Files.createDirectories(target.resolve(f.getFileName())); + copyFiles(f, subTarget); + } else { + Files.copy(f, target.resolve(f.getFileName())); + } + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + }); + } }