From 81f82b5d34d67d61017d546fe761daf0b717ca27 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Fri, 17 Jan 2020 15:26:21 +0100 Subject: [PATCH] partial implementation of applications to migrate entities --- .../migration/AbstractMigrateApplication.java | 61 +++ .../eu/dnetlib/dhp/migration/DbClient.java | 58 +++ .../dnetlib/dhp/migration/MdstoreClient.java | 87 ++++ .../MigrateDbEntitiesApplication.java | 390 ++++++++++++++++++ .../MigrateMongoMdstoresApplication.java | 190 +++++++++ .../dnetlib/dhp/migration/MigrationUtils.java | 164 ++++++++ .../migrate_db_entities_parameters.json | 38 ++ .../migrate_mongo_mstores_parameters.json | 50 +++ .../sql/queryDatasourceOrganization.sql | 16 + .../dhp/migration/sql/queryDatasources.sql | 147 +++++++ .../dhp/migration/sql/queryOrganizations.sql | 36 ++ .../sql/queryOrganizationsFromOpenOrgsDB.sql | 53 +++ .../sql/queryProjectOrganization.sql | 16 + .../dhp/migration/sql/queryProjects.sql | 87 ++++ .../sql/querySimilarityFromOpenOrgsDB.sql | 17 + 15 files changed, 1410 insertions(+) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java new file mode 100644 index 000000000..a5c8b2775 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMigrateApplication.java @@ -0,0 +1,61 @@ +package eu.dnetlib.dhp.migration; + +import java.io.Closeable; +import java.io.IOException; +import java.net.URI; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.codehaus.jackson.map.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Oaf; + +public class AbstractMigrateApplication implements Closeable { + + private final AtomicInteger counter = new AtomicInteger(0); + + private final IntWritable key = new IntWritable(counter.get()); + + private final Text value = new Text(); + + private final ObjectMapper objectMapper = new ObjectMapper(); + + private final SequenceFile.Writer writer; + + public AbstractMigrateApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception { + this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer + .keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class)); + } + + private Configuration getConf(final String hdfsNameNode, final String hdfsUser) throws IOException { + final Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + System.setProperty("HADOOP_USER_NAME", hdfsUser); + System.setProperty("hadoop.home.dir", "/"); + FileSystem.get(URI.create(hdfsNameNode), conf); + return conf; + } + + protected void emitOaf(final Oaf oaf) { + try { + key.set(counter.getAndIncrement()); + value.set(objectMapper.writeValueAsString(oaf)); + writer.append(key, value); + } catch (final Exception e) { + e.printStackTrace(); + } + } + + @Override + public void close() throws IOException { + writer.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java new file mode 100644 index 000000000..e9fee63b9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/DbClient.java @@ -0,0 +1,58 @@ +package eu.dnetlib.dhp.migration; + +import java.io.Closeable; +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.function.Consumer; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class DbClient implements Closeable { + + private static final Log log = LogFactory.getLog(DbClient.class); + + private Connection connection; + + public DbClient(final String address, final String login, final String password) { + + try { + Class.forName("org.postgresql.Driver"); + this.connection = DriverManager.getConnection(address, login, password); + this.connection.setAutoCommit(false); + } catch (final Exception e) { + log.error(e.getClass().getName() + ": " + e.getMessage()); + throw new RuntimeException(e); + } + log.info("Opened database successfully"); + } + + public void processResults(final String sql, final Consumer consumer) { + + try (final Statement stmt = connection.createStatement()) { + try (final ResultSet rs = stmt.executeQuery("SELECT * FROM COMPANY;")) { + while (rs.next()) { + consumer.accept(rs); + } + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } catch (final SQLException e1) { + throw new RuntimeException(e1); + } + } + + @Override + public void close() throws IOException { + try { + connection.close(); + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java new file mode 100644 index 000000000..971d7f165 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MdstoreClient.java @@ -0,0 +1,87 @@ +package eu.dnetlib.dhp.migration; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.StreamSupport; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.bson.Document; + +import com.google.common.collect.Iterables; +import com.mongodb.MongoClient; +import com.mongodb.MongoClientURI; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; + +public class MdstoreClient implements Closeable { + + private final MongoClient client; + private final MongoDatabase db; + + private static final String COLL_METADATA = "metadata"; + private static final String COLL_METADATA_MANAGER = "metadataManager"; + + private static final Log log = LogFactory.getLog(MdstoreClient.class); + + public MdstoreClient(final String baseUrl, final String dbName) { + this.client = new MongoClient(new MongoClientURI(baseUrl)); + this.db = getDb(client, dbName); + } + + public Map validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) { + + final Map transactions = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA_MANAGER).find()) { + final String mdId = entry.getString("mdId"); + final String currentId = entry.getString("currentId"); + if (StringUtils.isNoneBlank(mdId, currentId)) { + transactions.put(mdId, currentId); + } + } + + final Map res = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA).find()) { + if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout) + && entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) { + res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); + } + } + + return res; + } + + private MongoDatabase getDb(final MongoClient client, final String dbName) { + if (!Iterables.contains(client.listDatabaseNames(), dbName)) { + final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress()); + log.warn(err); + throw new RuntimeException(err); + } + return client.getDatabase(dbName); + } + + private MongoCollection getColl(final MongoDatabase db, final String collName) { + if (!Iterables.contains(db.listCollectionNames(), collName)) { + final String err = String.format(String.format("Missing collection '%s' in database '%s'", collName, db.getName())); + log.warn(err); + throw new RuntimeException(err); + } + return db.getCollection(collName); + } + + public Iterable listRecords(final String coll) { + return () -> StreamSupport.stream(getColl(db, coll).find().spliterator(), false) + .filter(e -> e.containsKey("body")) + .map(e -> e.getString("body")) + .iterator(); + } + + @Override + public void close() throws IOException { + client.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java new file mode 100644 index 000000000..60a7c24f7 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateDbEntitiesApplication.java @@ -0,0 +1,390 @@ +package eu.dnetlib.dhp.migration; + +import java.io.Closeable; +import java.io.IOException; +import java.sql.ResultSet; +import java.util.Arrays; +import java.util.function.Consumer; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class MigrateDbEntitiesApplication extends AbstractMigrateApplication implements Closeable { + + private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); + + private final DbClient dbClient; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json"))); + + parser.parseArgument(args); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + final String hdfsPath = parser.get("hdfsPath"); + final String hdfsNameNode = parser.get("namenode"); + final String hdfsUser = parser.get("hdfsUser"); + + try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) { + smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + smdbe.execute("queryProjects.sql", smdbe::processProject); + smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); + } + + } + + public MigrateDbEntitiesApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String dbUrl, final String dbUser, + final String dbPassword) throws Exception { + super(hdfsPath, hdfsNameNode, hdfsUser); + this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); + } + + public void execute(final String sqlFile, final Consumer consumer) throws Exception { + final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile)); + dbClient.processResults(sql, consumer); + } + + public void processDatasource(final ResultSet rs) { + try { + + final DataInfo info = MigrationUtils.dataInfo(null, null, null, null, null, null); // TODO + + final Datasource ds = new Datasource(); + + ds.setId(MigrationUtils.createOpenaireId("10", rs.getString("datasourceid"))); + ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); + ds.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + ds.setPid(null); // List // TODO + ds.setDateofcollection(rs.getDate("dateofcollection").toString()); + ds.setDateoftransformation(null); // TODO + ds.setExtraInfo(null); // TODO + ds.setOaiprovenance(null); // TODO + + ds.setDatasourcetype(null); // Qualifier datasourcetype) { + ds.setOpenairecompatibility(null); // Qualifier openairecompatibility) { + ds.setOfficialname(MigrationUtils.field(rs.getString("officialname"), info)); + ds.setEnglishname(MigrationUtils.field(rs.getString("englishname"), info)); + ds.setWebsiteurl(MigrationUtils.field(rs.getString("websiteurl"), info)); + ds.setLogourl(MigrationUtils.field(rs.getString("logourl"), info)); + ds.setContactemail(MigrationUtils.field(rs.getString("contactemail"), info)); + ds.setNamespaceprefix(MigrationUtils.field(rs.getString("namespaceprefix"), info)); + ds.setLatitude(MigrationUtils.field(Double.toString(rs.getDouble("latitude")), info)); + ds.setLongitude(MigrationUtils.field(Double.toString(rs.getDouble("longitude")), info)); + ds.setDateofvalidation(MigrationUtils.field(rs.getDate("dateofvalidation").toString(), info)); + ds.setDescription(MigrationUtils.field(rs.getString("description"), info)); + ds.setSubjects(null); // List subjects) { + ds.setOdnumberofitems(MigrationUtils.field(Double.toString(rs.getInt("odnumberofitems")), info)); + ds.setOdnumberofitemsdate(MigrationUtils.field(rs.getDate("odnumberofitemsdate").toString(), info)); + ds.setOdpolicies(MigrationUtils.field(rs.getString("odpolicies"), info)); + ds.setOdlanguages(MigrationUtils.listFields(info, rs.getArray("odlanguages"))); + ds.setOdcontenttypes(MigrationUtils.listFields(info, rs.getArray("odcontenttypes"))); + ds.setAccessinfopackage(MigrationUtils.listFields(info, rs.getArray("accessinfopackage"))); + ds.setReleasestartdate(MigrationUtils.field(rs.getDate("releasestartdate").toString(), info)); + ds.setReleaseenddate(MigrationUtils.field(rs.getDate("releaseenddate").toString(), info)); + ds.setMissionstatementurl(MigrationUtils.field(rs.getString("missionstatementurl"), info)); + ds.setDataprovider(MigrationUtils.field(rs.getBoolean("dataprovider"), info)); + ds.setServiceprovider(MigrationUtils.field(rs.getBoolean("serviceprovider"), info)); + ds.setDatabaseaccesstype(MigrationUtils.field(rs.getString("databaseaccesstype"), info)); + ds.setDatauploadtype(MigrationUtils.field(rs.getString("datauploadtype"), info)); + ds.setDatabaseaccessrestriction(MigrationUtils.field(rs.getString("databaseaccessrestriction"), info)); + ds.setDatauploadrestriction(MigrationUtils.field(rs.getString("datauploadrestriction"), info)); + ds.setVersioning(MigrationUtils.field(rs.getBoolean("versioning"), info)); + ds.setCitationguidelineurl(MigrationUtils.field(rs.getString("citationguidelineurl"), info)); + ds.setQualitymanagementkind(MigrationUtils.field(rs.getString("qualitymanagementkind"), info)); + ds.setPidsystems(MigrationUtils.field(rs.getString("pidsystems"), info)); + ds.setCertificates(MigrationUtils.field(rs.getString("certificates"), info)); + ds.setPolicies(null); // List // TODO + ds.setJournal(null); // Journal // TODO + + // rs.getString("datasourceid"); + rs.getArray("identities"); + // rs.getString("officialname"); + // rs.getString("englishname"); + // rs.getString("contactemail"); + rs.getString("openairecompatibility"); // COMPLEX ...@@@... + // rs.getString("websiteurl"); + // rs.getString("logourl"); + // rs.getArray("accessinfopackage"); + // rs.getDouble("latitude"); + // rs.getDouble("longitude"); + // rs.getString("namespaceprefix"); + // rs.getInt("odnumberofitems"); // NULL + // rs.getDate("odnumberofitemsdate"); // NULL + rs.getArray("subjects"); + // rs.getString("description"); + // rs.getString("odpolicies"); // NULL + // rs.getArray("odlanguages"); + // rs.getArray("odcontenttypes"); + rs.getBoolean("inferred"); // false + rs.getBoolean("deletedbyinference");// false + rs.getDouble("trust"); // 0.9 + rs.getString("inferenceprovenance"); // NULL + // rs.getDate("dateofcollection"); + // rs.getDate("dateofvalidation"); + // rs.getDate("releasestartdate"); + // rs.getDate("releaseenddate"); + // rs.getString("missionstatementurl"); + // rs.getBoolean("dataprovider"); + // rs.getBoolean("serviceprovider"); + // rs.getString("databaseaccesstype"); + // rs.getString("datauploadtype"); + // rs.getString("databaseaccessrestriction"); + // rs.getString("datauploadrestriction"); + // rs.getBoolean("versioning"); + // rs.getString("citationguidelineurl"); + // rs.getString("qualitymanagementkind"); + // rs.getString("pidsystems"); + // rs.getString("certificates"); + rs.getArray("policies"); + // rs.getString("collectedfromid"); + // rs.getString("collectedfromname"); + rs.getString("datasourcetype"); // COMPLEX XXX@@@@.... + rs.getString("provenanceaction"); // 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' + // AS provenanceaction, + rs.getString("journal"); // CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal + + emitOaf(ds); + } catch (final Exception e) { + // TODO: handle exception + } + } + + public void processProject(final ResultSet rs) { + try { + + final DataInfo info = MigrationUtils.dataInfo(null, null, null, null, null, null); // TODO + + final Project p = new Project(); + + p.setId(MigrationUtils.createOpenaireId("40", rs.getString("projectid"))); + p.setOriginalId(Arrays.asList(rs.getString("projectid"))); + p.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + p.setPid(null); // List // TODO + + p.setDateofcollection(rs.getDate("dateofcollection").toString()); + p.setDateoftransformation(rs.getDate("dateoftransformation").toString()); + p.setExtraInfo(null); // List //TODO + p.setOaiprovenance(null); // OAIProvenance /TODO + + p.setWebsiteurl(MigrationUtils.field(rs.getString("websiteurl"), info)); + p.setCode(MigrationUtils.field(rs.getString("code"), info)); + p.setAcronym(MigrationUtils.field(rs.getString("acronym"), info)); + p.setTitle(MigrationUtils.field(rs.getString("title"), info)); + p.setStartdate(MigrationUtils.field(rs.getDate("startdate").toString(), info)); + p.setEnddate(MigrationUtils.field(rs.getDate("enddate").toString(), info)); + p.setCallidentifier(MigrationUtils.field(rs.getString("callidentifier"), info)); + p.setKeywords(MigrationUtils.field(rs.getString("keywords"), info)); + p.setDuration(MigrationUtils.field(Integer.toString(rs.getInt("duration")), info)); + p.setEcsc39(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecsc39")), info)); + p.setOamandatepublications(MigrationUtils.field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); + p.setEcarticle29_3(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); + p.setSubjects(null); // List //TODO + p.setFundingtree(null); // List> //TODO + p.setContracttype(null); // Qualifier //TODO + p.setOptional1(MigrationUtils.field(rs.getString("optional1"), info)); + p.setOptional2(MigrationUtils.field(rs.getString("optional2"), info)); + p.setJsonextrainfo(MigrationUtils.field(rs.getString("jsonextrainfo"), info)); + p.setContactfullname(MigrationUtils.field(rs.getString("contactfullname"), info)); + p.setContactfax(MigrationUtils.field(rs.getString("contactfax"), info)); + p.setContactphone(MigrationUtils.field(rs.getString("contactphone"), info)); + p.setContactemail(MigrationUtils.field(rs.getString("contactemail"), info)); + p.setSummary(MigrationUtils.field(rs.getString("summary"), info)); + p.setCurrency(MigrationUtils.field(rs.getString("currency"), info)); + p.setTotalcost(new Float(rs.getDouble("totalcost"))); + p.setFundedamount(new Float(rs.getDouble("fundedamount"))); + + // rs.getString("projectid"); + // rs.getString("code"); + // rs.getString("websiteurl"); + // rs.getString("acronym"); + // rs.getString("title"); + // rs.getDate("startdate"); + // rs.getDate("enddate"); + // rs.getString("callidentifier"); + // rs.getString("keywords"); + // rs.getInt("duration"); + // rs.getBoolean("ecsc39"); + // rs.getBoolean("oamandatepublications"); + // rs.getBoolean("ecarticle29_3"); + // rs.getDate("dateofcollection"); + // rs.getDate("dateoftransformation"); + rs.getBoolean("inferred"); + rs.getBoolean("deletedbyinference"); + rs.getDouble("trust"); + rs.getString("inferenceprovenance"); + // rs.getString("optional1"); + // rs.getString("optional2"); + rs.getString("jsonextrainfo"); + // rs.getString("contactfullname"); + // rs.getString("contactfax"); + // rs.getString("contactphone"); + // rs.getString("contactemail"); + // rs.getString("summary"); + // rs.getString("currency"); + // rs.getDouble("totalcost"); + // rs.getDouble("fundedamount"); + // rs.getString("collectedfromid"); + // rs.getString("collectedfromname"); + rs.getString("contracttype"); // COMPLEX + rs.getString("provenanceaction"); // COMPLEX + rs.getArray("pid"); + rs.getArray("subjects"); + rs.getArray("fundingtree"); + + emitOaf(p); + + } catch (final Exception e) { + // TODO: handle exception + } + } + + public void processOrganization(final ResultSet rs) { + try { + + final DataInfo info = MigrationUtils.dataInfo(null, null, null, null, null, null); // TODO + + final Organization o = new Organization(); + + o.setId(MigrationUtils.createOpenaireId("20", rs.getString("organizationid"))); // String id) { + o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); + o.setCollectedfrom(MigrationUtils.listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + o.setPid(null); // List // TODO + o.setDateofcollection(rs.getDate("dateofcollection").toString()); + o.setDateoftransformation(rs.getDate("dateoftransformation").toString()); + o.setExtraInfo(null); // List // TODO + o.setOaiprovenance(null); // OAIProvenance // TODO + o.setLegalshortname(MigrationUtils.field("legalshortname", info)); + o.setLegalname(MigrationUtils.field("legalname", info)); + o.setAlternativeNames(null); // List> //TODO + o.setWebsiteurl(MigrationUtils.field("websiteurl", info)); + o.setLogourl(MigrationUtils.field("logourl", info)); + o.setEclegalbody(MigrationUtils.field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); + o.setEclegalperson(MigrationUtils.field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); + o.setEcnonprofit(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); + o.setEcresearchorganization(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); + o.setEchighereducation(MigrationUtils.field(Boolean.toString(rs.getBoolean("echighereducation")), info)); + o.setEcinternationalorganizationeurinterests(MigrationUtils + .field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info)); + o.setEcinternationalorganization(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); + o.setEcenterprise(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); + o.setEcsmevalidated(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); + o.setEcnutscode(MigrationUtils.field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); + o.setCountry(null); // Qualifier country) { + + // rs.getString("organizationid"); + // rs.getString("legalshortname"); + // rs.getString("legalname"); + // rs.getString("websiteurl"); + // rs.getString("logourl"); + // rs.getBoolean("eclegalbody"); + // rs.getBoolean("eclegalperson"); + // rs.getBoolean("ecnonprofit"); + // rs.getBoolean("ecresearchorganization"); + // rs.getBoolean("echighereducation"); + // rs.getBoolean("ecinternationalorganizationeurinterests"); + // rs.getBoolean("ecinternationalorganization"); + // rs.getBoolean("ecenterprise"); + // rs.getBoolean("ecsmevalidated"); + // rs.getBoolean("ecnutscode"); + rs.getDate("dateofcollection"); + rs.getDate("dateoftransformation"); + rs.getBoolean("inferred"); + rs.getBoolean("deletedbyinference"); + rs.getDouble("trust"); + rs.getString("inferenceprovenance"); + // rs.getString("collectedfromid"); + // rs.getString("collectedfromname"); + rs.getString("country"); + rs.getString("provenanceaction"); + rs.getArray("pid"); + + emitOaf(o); + } catch (final Exception e) { + // TODO: handle exception + } + } + + public void processDatasourceOrganization(final ResultSet rs) { + + try { + final Relation r = new Relation(); + + r.setRelType(null); // TODO + r.setSubRelType(null); // TODO + r.setRelClass(null); // TODO + r.setSource(null); // TODO + r.setTarget(null); // TODO + r.setCollectedFrom(MigrationUtils.listKeyValues("", "")); + + rs.getString("datasource"); + rs.getString("organization"); + rs.getDate("startdate"); // NULL + rs.getDate("enddate"); // NULL + rs.getBoolean("inferred"); // false + rs.getBoolean("deletedbyinference"); // false + rs.getDouble("trust"); // 0.9 + rs.getString("inferenceprovenance"); // NULL + rs.getString("semantics"); // 'providedBy@@@provided + // by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS + // semantics, + rs.getString("provenanceaction"); // d.provenanceaction || '@@@' || d.provenanceaction || + // '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction + + emitOaf(r); + } catch (final Exception e) { + // TODO: handle exception + } + } + + public void processProjectOrganization(final ResultSet rs) { + try { + final Relation r = new Relation(); + + r.setRelType(null); // TODO + r.setSubRelType(null); // TODO + r.setRelClass(null); // TODO + r.setSource(null); // TODO + r.setTarget(null); // TODO + r.setCollectedFrom(null); + + rs.getString("project"); + rs.getString("resporganization"); + rs.getInt("participantnumber"); + rs.getDouble("contribution"); + rs.getDate("startdate");// null + rs.getDate("enddate");// null + rs.getBoolean("inferred");// false + rs.getBoolean("deletedbyinference"); // false + rs.getDouble("trust"); + rs.getString("inferenceprovenance"); // NULL + rs.getString("semantics"); // po.semanticclass || '@@@' || po.semanticclass || + // '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics, + rs.getString("provenanceaction"); // 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' + // AS provenanceaction + emitOaf(r); + } catch (final Exception e) { + // TODO: handle exception + } + } + + @Override + public void close() throws IOException { + super.close(); + dbClient.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java new file mode 100644 index 000000000..cead2366b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrateMongoMdstoresApplication.java @@ -0,0 +1,190 @@ +package eu.dnetlib.dhp.migration; + +import java.io.Closeable; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Map.Entry; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Node; +import org.dom4j.io.SAXReader; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class MigrateMongoMdstoresApplication extends AbstractMigrateApplication implements Closeable { + + private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); + + private final MdstoreClient mdstoreClient; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json"))); + parser.parseArgument(args); + + final String mongoBaseUrl = parser.get("mongoBaseUrl"); + final String mongoDb = parser.get("mongoDb"); + + final String mdFormat = parser.get("mdFormat"); + final String mdLayout = parser.get("mdLayout"); + final String mdInterpretation = parser.get("mdInterpretation"); + + final String hdfsPath = parser.get("hdfsPath"); + final String hdfsNameNode = parser.get("namenode"); + final String hdfsUser = parser.get("hdfsUser"); + + try (final MigrateMongoMdstoresApplication mig = new MigrateMongoMdstoresApplication(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb)) { + mig.processMdRecords(mdFormat, mdLayout, mdInterpretation); + } + + } + + public MigrateMongoMdstoresApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, + final String mongoDb) throws Exception { + super(hdfsPath, hdfsNameNode, hdfsUser); + this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); + + } + + public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException { + + for (final Entry entry : mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation).entrySet()) { + // final String mdId = entry.getKey(); + final String currentColl = entry.getValue(); + + for (final String xml : mdstoreClient.listRecords(currentColl)) { + for (final Oaf oaf : createOafs(xml)) { + emitOaf(oaf); + } + } + } + } + + private List createOafs(final String xml) throws DocumentException { + final SAXReader reader = new SAXReader(); + final Document doc = reader.read(new StringReader(xml)); + + final String type = doc.valueOf(""); // TODO + + final List oafs = new ArrayList<>(); + + switch (type.toLowerCase()) { + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc); + p.setJournal(null); // TODO + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc); + d.setStoragedate(null); // TODO + d.setDevice(null); // TODO + d.setSize(null); // TODO + d.setVersion(null); // TODO + d.setLastmetadataupdate(null); // TODO + d.setMetadataversionnumber(null); // TODO + d.setGeolocation(null); // TODO + oafs.add(d); + break; + case "otherresearchproducts": + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc); + o.setContactperson(null); // TODO + o.setContactgroup(null); // TODO + o.setTool(null); // TODO + oafs.add(o); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc); + s.setDocumentationUrl(null); // TODO + s.setLicense(null); // TODO + s.setCodeRepositoryUrl(null); // TODO + s.setProgrammingLanguage(null); // TODO + oafs.add(s); + break; + default: + log.error("Inavlid type: " + type); + break; + } + + if (!oafs.isEmpty()) { + addRelations(oafs, doc, "//*", "TYPE"); + addRelations(oafs, doc, "//*", "TYPE"); + addRelations(oafs, doc, "//*", "TYPE"); + } + + return oafs; + } + + private void addRelations(final List oafs, final Document doc, final String xpath, final String type) { + for (final Object o : doc.selectNodes(xpath)) { + final Node n = (Node) o; + final Relation r = new Relation(); + r.setRelType(null); // TODO + r.setSubRelType(null); // TODO + r.setRelClass(null); // TODO + r.setSource(null); // TODO + r.setTarget(null); // TODO + r.setCollectedFrom(null); // TODO + oafs.add(r); + } + + } + + private void populateResultFields(final Result r, final Document doc) { + r.setDataInfo(null); // TODO + r.setLastupdatetimestamp(null); // TODO + r.setId(null); // TODO + r.setOriginalId(null); // TODO + r.setCollectedfrom(null); // TODO + r.setPid(null); // TODO + r.setDateofcollection(null); // TODO + r.setDateoftransformation(null); // TODO + r.setExtraInfo(null); // TODO + r.setOaiprovenance(null); // TODO + r.setAuthor(null); // TODO + r.setResulttype(null); // TODO + r.setLanguage(null); // TODO + r.setCountry(null); // TODO + r.setSubject(null); // TODO + r.setTitle(null); // TODO + r.setRelevantdate(null); // TODO + r.setDescription(null); // TODO + r.setDateofacceptance(null); // TODO + r.setPublisher(null); // TODO + r.setEmbargoenddate(null); // TODO + r.setSource(null); // TODO + r.setFulltext(null); // TODO + r.setFormat(null); // TODO + r.setContributor(null); // TODO + r.setResourcetype(null); // TODO + r.setCoverage(null); // TODO + r.setRefereed(null); // TODO + r.setContext(null); // TODO + r.setExternalReference(null); // TODO + r.setInstance(null); // TODO + r.setProcessingchargeamount(null); // TODO + r.setProcessingchargecurrency(null); // TODO + } + + @Override + public void close() throws IOException { + super.close(); + mdstoreClient.close(); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java new file mode 100644 index 000000000..8346a8041 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/MigrationUtils.java @@ -0,0 +1,164 @@ +package eu.dnetlib.dhp.migration; + +import java.sql.Array; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.ExtraInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OAIProvenance; +import eu.dnetlib.dhp.schema.oaf.OriginDescription; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class MigrationUtils { + + public static KeyValue keyValue(final String k, final String v) { + final KeyValue kv = new KeyValue(); + kv.setKey(k); + kv.setValue(v); + return kv; + } + + public static List listKeyValues(final String... s) { + if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); } + + final List list = new ArrayList<>(); + for (int i = 0; i < s.length; i += 2) { + list.add(keyValue(s[i], s[i + 1])); + } + return list; + } + + public static Field field(final T value, final DataInfo info) { + final Field field = new Field<>(); + field.setValue(value); + field.setDataInfo(info); + return field; + } + + public static List> listFields(final DataInfo info, final String... values) { + return Arrays.stream(values).map(v -> field(v, info)).collect(Collectors.toList()); + } + + public static List> listFields(final DataInfo info, final Array array) { + try { + return listFields(info, (String[]) array.getArray()); + } catch (final SQLException e) { + throw new RuntimeException("Invalid SQL array", e); + } + } + + public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) { + final Qualifier q = new Qualifier(); + q.setClassid(classid); + q.setClassname(classname); + q.setSchemeid(schemeid); + q.setSchemename(schemename); + return q; + } + + public static StructuredProperty structuredProperty(final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(value); + sp.setQualifier(qualifier(classid, classname, schemeid, schemename)); + sp.setDataInfo(dataInfo); + return sp; + } + + public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) { + final ExtraInfo info = new ExtraInfo(); + info.setName(name); + info.setValue(value); + info.setTypology(typology); + info.setProvenance(provenance); + info.setTrust(trust); + return info; + } + + public static OAIProvenance oaiIProvenance(final String identifier, + final String baseURL, + final String metadataNamespace, + final Boolean altered, + final String datestamp, + final String harvestDate) { + + final OriginDescription desc = new OriginDescription(); + desc.setIdentifier(identifier); + desc.setBaseURL(baseURL); + desc.setMetadataNamespace(metadataNamespace); + desc.setAltered(altered); + desc.setDatestamp(datestamp); + desc.setHarvestDate(harvestDate); + + final OAIProvenance p = new OAIProvenance(); + p.setOriginDescription(desc); + + return p; + } + + public static Journal journal(final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final String ep, + final String iss, + final String sp, + final String vol, + final String edition, + final String conferenceplace, + final String conferencedate, + final DataInfo dataInfo) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + j.setDataInfo(dataInfo); + return j; + } + + public static DataInfo dataInfo(final Boolean deletedbyinference, + final String inferenceprovenance, + final Boolean inferred, + final Boolean invisible, + final Qualifier provenanceaction, + final String trust) { + final DataInfo d = new DataInfo(); + d.setDeletedbyinference(deletedbyinference); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setInvisible(invisible); + d.setProvenanceaction(provenanceaction); + d.setTrust(trust); + return d; + } + + public static String createOpenaireId(final String prefix, final String originalId) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json new file mode 100644 index 000000000..861d297ba --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json @@ -0,0 +1,38 @@ +[ + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the path where storing the sequential file", + "paramRequired": true + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": true + }, + { + "paramName": "u", + "paramLongName": "hdfsUser", + "paramDescription": "the user wich create the hdfs seq file", + "paramRequired": true + }, + { + "paramName": "dburl", + "paramLongName": "postgresUrl", + "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", + "paramRequired": true + }, + { + "paramName": "dbuser", + "paramLongName": "postgresUser", + "paramDescription": "postgres user", + "paramRequired": true + }, + { + "paramName": "dbpasswd", + "paramLongName": "postgresPassword", + "paramDescription": "postgres password", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json new file mode 100644 index 000000000..fb5736dc0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json @@ -0,0 +1,50 @@ +[ + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the path where storing the sequential file", + "paramRequired": true + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": true + }, + { + "paramName": "u", + "paramLongName": "hdfsUser", + "paramDescription": "the user wich create the hdfs seq file", + "paramRequired": true + }, + { + "paramName": "mongourl", + "paramLongName": "mongoBaseUrl", + "paramDescription": "mongoDB url, example: mongodb://[username:password@]host[:port]", + "paramRequired": true + }, + { + "paramName": "db", + "paramLongName": "mongoDb", + "paramDescription": "mongo database", + "paramRequired": true + }, + { + "paramName": "f", + "paramLongName": "mdFormat", + "paramDescription": "metadata format", + "paramRequired": true + }, + { + "paramName": "l", + "paramLongName": "mdLayout", + "paramDescription": "metadata layout", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "mdInterpretation", + "paramDescription": "metadata interpretation", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql new file mode 100644 index 000000000..885b6ae09 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql @@ -0,0 +1,16 @@ +SELECT + dor.datasource AS datasource, + dor.organization AS organization, + NULL AS startdate, + NULL AS enddate, + false AS inferred, + false AS deletedbyinference, + 0.9 AS trust, + NULL AS inferenceprovenance, + + 'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics, + d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction + +FROM dsm_datasource_organization dor + LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id) + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql new file mode 100644 index 000000000..8c587f34e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql @@ -0,0 +1,147 @@ +SELECT + d.id AS datasourceid, + d.id || array_agg(distinct di.pid) AS identities, + d.officialname AS officialname, + d.englishname AS englishname, + d.contactemail AS contactemail, + CASE + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1']) + THEN + 'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0']) + THEN + 'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver']) + THEN + 'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0']) + THEN + 'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0']) + THEN + 'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data']) + THEN + 'openaire2.0_data@@@OpenAIRE Data (funded, referenced datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native']) + THEN + 'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy']) + THEN + 'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible']) + THEN + 'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + ELSE + 'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + END AS openairecompatibility, + d.websiteurl AS websiteurl, + d.logourl AS logourl, + array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage, + d.latitude AS latitude, + d.longitude AS longitude, + d.namespaceprefix AS namespaceprefix, + NULL AS odnumberofitems, + NULL AS odnumberofitemsdate, + + (SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies') + FROM UNNEST( + ARRAY( + SELECT trim(s) + FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects, + + d.description AS description, + NULL AS odpolicies, + ARRAY(SELECT trim(s) + FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages, + ARRAY(SELECT trim(s) + FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes, + false AS inferred, + false AS deletedbyinference, + 0.9 AS trust, + NULL AS inferenceprovenance, + d.dateofcollection AS dateofcollection, + d.dateofvalidation AS dateofvalidation, + -- re3data fields + d.releasestartdate AS releasestartdate, + d.releaseenddate AS releaseenddate, + d.missionstatementurl AS missionstatementurl, + d.dataprovider AS dataprovider, + d.serviceprovider AS serviceprovider, + d.databaseaccesstype AS databaseaccesstype, + d.datauploadtype AS datauploadtype, + d.databaseaccessrestriction AS databaseaccessrestriction, + d.datauploadrestriction AS datauploadrestriction, + d.versioning AS versioning, + d.citationguidelineurl AS citationguidelineurl, + d.qualitymanagementkind AS qualitymanagementkind, + d.pidsystems AS pidsystems, + d.certificates AS certificates, + ARRAY[]::text[] AS policies, + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, + d.typology || '@@@' || CASE + WHEN (d.typology = 'crissystem') THEN 'CRIS System' + WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository' + WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator' + WHEN (d.typology = 'infospace') THEN 'Information Space' + WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository' + WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional Repository Aggregator' + WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal' + WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher' + WHEN (d.typology = 'pubsrepository::mock') THEN 'Other' + WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue' + WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository' + WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator' + WHEN (d.typology = 'entityregistry') THEN 'Registry' + WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. Infrastructure' + WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository' + WHEN (d.typology = 'websource') THEN 'Web Source' + WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database' + WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories' + WHEN (d.typology = 'softwarerepository') THEN 'Software Repository' + WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator' + WHEN (d.typology = 'orprepository') THEN 'Repository' + ELSE 'Other' + END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal + +FROM dsm_datasources d + +LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id) +LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource) +LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource) + +GROUP BY + d.id, + d.officialname, + d.englishname, + d.websiteurl, + d.logourl, + d.contactemail, + d.namespaceprefix, + d.description, + d.latitude, + d.longitude, + d.dateofcollection, + d.dateofvalidation, + d.releasestartdate, + d.releaseenddate, + d.missionstatementurl, + d.dataprovider, + d.serviceprovider, + d.databaseaccesstype, + d.datauploadtype, + d.databaseaccessrestriction, + d.datauploadrestriction, + d.versioning, + d.citationguidelineurl, + d.qualitymanagementkind, + d.pidsystems, + d.certificates, + dc.id, + dc.officialname, + d.issn, + d.eissn, + d.lissn diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql new file mode 100644 index 000000000..682ca3596 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql @@ -0,0 +1,36 @@ +SELECT + o.id AS organizationid, + o.legalshortname AS legalshortname, + o.legalname AS legalname, + o.websiteurl AS websiteurl, + o.logourl AS logourl, + o.ec_legalbody AS eclegalbody, + o.ec_legalperson AS eclegalperson, + o.ec_nonprofit AS ecnonprofit, + o.ec_researchorganization AS ecresearchorganization, + o.ec_highereducation AS echighereducation, + o.ec_internationalorganizationeurinterests AS ecinternationalorganizationeurinterests, + o.ec_internationalorganization AS ecinternationalorganization, + o.ec_enterprise AS ecenterprise, + o.ec_smevalidated AS ecsmevalidated, + o.ec_nutscode AS ecnutscode, + o.dateofcollection AS dateofcollection, + o.lastupdate AS dateoftransformation, + false AS inferred, + false AS deletedbyinference, + o.trust AS trust, + '' AS inferenceprovenance, + d.id AS collectedfromid, + d.officialname AS collectedfromname, + + o.country || '@@@dnet:countries' AS country, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + + ARRAY[]::text[] AS pid +FROM dsm_organizations o + LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom) + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql new file mode 100644 index 000000000..dc9550883 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql @@ -0,0 +1,53 @@ +SELECT + o.id AS organizationid, + coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname, + o.name AS legalname, + array_agg(DISTINCT n.name) AS "alternativeNames", + (array_agg(u.url))[1] AS websiteurl, + o.modification_date AS dateoftransformation, + false AS inferred, + false AS deletedbyinference, + 0.95 AS trust, + '' AS inferenceprovenance, + 'openaire____::openorgs' AS collectedfromid, + 'OpenOrgs Database' AS collectedfromname, + o.country || '@@@dnet:countries' AS country, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid +FROM organizations o + LEFT OUTER JOIN acronyms a ON (a.id = o.id) + LEFT OUTER JOIN urls u ON (u.id = o.id) + LEFT OUTER JOIN other_ids i ON (i.id = o.id) + LEFT OUTER JOIN other_names n ON (n.id = o.id) +GROUP BY + o.id, + o.name, + o.modification_date, + o.country + +UNION ALL + +SELECT + 'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS organizationid, + n.name AS legalshortname, + n.name AS legalname, + ARRAY[]::text[] AS "alternativeNames", + (array_agg(u.url))[1] AS websiteurl, + o.modification_date AS dateoftransformation, + false AS inferred, + false AS deletedbyinference, + 0.88 AS trust, + '' AS inferenceprovenance, + 'openaire____::openorgs' AS collectedfromid, + 'OpenOrgs Database' AS collectedfromname, + o.country || '@@@dnet:countries' AS country, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid +FROM other_names n + LEFT OUTER JOIN organizations o ON (n.id = o.id) + LEFT OUTER JOIN urls u ON (u.id = o.id) + LEFT OUTER JOIN other_ids i ON (i.id = o.id) +GROUP BY + o.id, o.modification_date, o.country, n.name + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql new file mode 100644 index 000000000..4483d6145 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql @@ -0,0 +1,16 @@ +SELECT + po.project AS project, + po.resporganization AS resporganization, + po.participantnumber AS participantnumber, + po.contribution AS contribution, + NULL AS startdate, + NULL AS enddate, + false AS inferred, + false AS deletedbyinference, + po.trust AS trust, + NULL AS inferenceprovenance, + + po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction + +FROM project_organization po diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql new file mode 100644 index 000000000..f04f1f03b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql @@ -0,0 +1,87 @@ +SELECT + p.id AS projectid, + p.code AS code, + p.websiteurl AS websiteurl, + p.acronym AS acronym, + p.title AS title, + p.startdate AS startdate, + p.enddate AS enddate, + p.call_identifier AS callidentifier, + p.keywords AS keywords, + p.duration AS duration, + p.ec_sc39 AS ecsc39, + p.oa_mandate_for_publications AS oamandatepublications, + p.ec_article29_3 AS ecarticle29_3, + p.dateofcollection AS dateofcollection, + p.lastupdate AS dateoftransformation, + p.inferred AS inferred, + p.deletedbyinference AS deletedbyinference, + p.trust AS trust, + p.inferenceprovenance AS inferenceprovenance, + p.optional1 AS optional1, + p.optional2 AS optional2, + p.jsonextrainfo AS jsonextrainfo, + p.contactfullname AS contactfullname, + p.contactfax AS contactfax, + p.contactphone AS contactphone, + p.contactemail AS contactemail, + p.summary AS summary, + p.currency AS currency, + p.totalcost AS totalcost, + p.fundedamount AS fundedamount, + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, + p.contracttype || '@@@' || p.contracttypename || '@@@' || p.contracttypescheme || '@@@' || p.contracttypescheme AS contracttype, + pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction, + array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid, + array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects, + array_agg(DISTINCT fp.path) AS fundingtree + FROM projects p + LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass) + LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme) + + LEFT OUTER JOIN projectpids pp ON (pp.project = p.id) + LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid) + + LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom) + + LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id) + LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding) + + LEFT OUTER JOIN project_subject ps ON (ps.project = p.id) + LEFT OUTER JOIN subjects s ON (s.id = ps.subject) + + LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass) + LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme) + + GROUP BY + p.id, + p.code, + p.websiteurl, + p.acronym, + p.title, + p.startdate, + p.enddate, + p.call_identifier, + p.keywords, + p.duration, + p.ec_sc39, + p.oa_mandate_for_publications, + p.ec_article29_3, + p.dateofcollection, + p.inferred, + p.deletedbyinference, + p.trust, + p.inferenceprovenance, + p.contactfullname, + p.contactfax, + p.contactphone, + p.contactemail, + p.contracttype, + p.summary, + p.currency, + p.totalcost, + p.fundedamount, + dc.id, + dc.officialname, + pac.code, pac.name, pas.code, pas.name; \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql new file mode 100644 index 000000000..4407559c6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql @@ -0,0 +1,17 @@ +SELECT local_id AS id1, oa_original_id AS id2 FROM openaire_simrels WHERE reltype = 'is_similar' + +UNION ALL + +SELECT + o.id AS id1, + 'openorgsmesh'||substring(o.id, 13)||'-'||md5(a.acronym) AS id2 +FROM acronyms a + LEFT OUTER JOIN organizations o ON (a.id = o.id) + +UNION ALL + +SELECT + o.id AS id1, + 'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS id2 +FROM other_names n + LEFT OUTER JOIN organizations o ON (n.id = o.id)