re-packing all the D4I applications

Claudio Atzori 2022-02-15 11:17:47 +01:00
parent eaefc4d6d9
commit 2905051469
391 changed files with 17043 additions and 0 deletions

View File

@ -0,0 +1,119 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib</groupId>
<artifactId>data4impact-api-application</artifactId>
<version>1.1.0-SNAPSHOT</version>
<!-- <scm>
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/data4impact/data4impact-api-application/trunk</developerConnection>
</scm>
<ciManagement>
<system>jenkins</system>
<url>https://jenkins-dnet.d4science.org/view/data4impact/job/data4impact-api-application/</url>
</ciManagement>
<distributionManagement>
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 Releases</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
</repository>
</distributionManagement>
-->
<!-- Inherit defaults from Spring Boot -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.0.3.RELEASE</version>
<relativePath/>
</parent>
<!--
<repositories>
<repository>
<id>dnet-deps</id>
<name>dnet-dependencies</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps</url>
<layout>default</layout>
</repository>
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 Releases</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>dnet45-snapshots</id>
<name>D-Net 45 Snapshots</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
-->
<!-- Add typical dependencies for a web application -->
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>data4impact-model</artifactId>
<version>1.1.0-SNAPSHOT</version>
</dependency>
<!-- Swagger -->
<dependency>
<groupId>io.springfox</groupId>
<artifactId>springfox-swagger2</artifactId>
<version>${springfox-version}</version>
</dependency>
<dependency>
<groupId>io.springfox</groupId>
<artifactId>springfox-swagger-ui</artifactId>
<version>${springfox-version}</version>
</dependency>
<!-- JUnit -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<executable>true</executable>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<java.version>1.8</java.version>
<apache.solr.version>7.1.0</apache.solr.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<springfox-version>2.9.2</springfox-version>
<prometheus.version>0.2.0</prometheus.version>
<javamelody.version>1.71.0</javamelody.version>
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
<dockerfile-maven-version>1.3.6</dockerfile-maven-version>
</properties>
</project>
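A minimal sketch of how to build and launch the service packaged by this POM: the spring-boot-maven-plugin above repackages it as an executable jar, whose name follows the artifactId and version.

mvn clean package
java -jar target/data4impact-api-application-1.1.0-SNAPSHOT.jar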

View File

@ -0,0 +1,47 @@
package eu.data4impact;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.cache.annotation.EnableCaching;
import org.springframework.context.annotation.Bean;
import springfox.documentation.builders.ApiInfoBuilder;
import springfox.documentation.builders.RequestHandlerSelectors;
import springfox.documentation.service.ApiInfo;
import springfox.documentation.spi.DocumentationType;
import springfox.documentation.spring.web.plugins.Docket;
import springfox.documentation.swagger2.annotations.EnableSwagger2;
@SpringBootApplication
@EnableSwagger2
@EnableCaching
public class MainApplication {
private static final Logger log = LoggerFactory.getLogger(MainApplication.class);
public static void main(final String[] args) {
SpringApplication.run(MainApplication.class, args);
}
@Bean
public static Docket newSwaggerDocket() {
log.info("Initializing SWAGGER...");
return new Docket(DocumentationType.SWAGGER_2)
.select()
.apis(RequestHandlerSelectors.any())
.paths(p -> p.startsWith("/api/"))
.build().apiInfo((new ApiInfoBuilder())
.title("Data4impact Service APIs")
.description("APIs documentation")
.version("1.1")
.contact(ApiInfo.DEFAULT_CONTACT)
.license("Apache 2.0")
.licenseUrl("http://www.apache.org/licenses/LICENSE-2.0")
.build());
}
}

View File

@ -0,0 +1,14 @@
package eu.data4impact;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
@Controller
public class SwaggerController {
@RequestMapping(value = { "/", "/apidoc", "/api-doc", "/doc", "/swagger" }, method = RequestMethod.GET)
public String apiDoc() {
return "redirect:swagger-ui.html";
}
}

View File

@ -0,0 +1,38 @@
package eu.data4impact.controller;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import eu.data4impact.utils.MainEntity;
public abstract class AbstractJpaController<T extends MainEntity> {
public abstract JpaRepository<T, String> getRepo();
@RequestMapping(value = "/list/{page}/{size}", method = RequestMethod.GET)
public final List<T> find(@PathVariable final int page, @PathVariable final int size) {
return getRepo().findAll(PageRequest.of(page, size)).getContent();
}
@RequestMapping(value = "/identifiers/{page}/{size}", method = RequestMethod.GET)
public final List<String> findIdentifiers(@PathVariable final int page, @PathVariable final int size) {
return find(page, size).stream().map(MainEntity::getId).collect(Collectors.toList());
}
@RequestMapping(value = "/get", method = RequestMethod.GET)
public final T get(@RequestParam final String id) {
return getRepo().findById(id).orElse(null);
}
@RequestMapping(value = "/count", method = RequestMethod.GET)
public final long count() {
return getRepo().count();
}
}

View File

@ -0,0 +1,38 @@
package eu.data4impact.controller;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.data.domain.PageRequest;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import eu.data4impact.utils.MainEntity;
import eu.data4impact.utils.ReadOnlyRepository;
public abstract class AbstractReadOnlyController<T extends MainEntity> {
public abstract ReadOnlyRepository<T, String> getRepo();
@RequestMapping(value = "/list/{page}/{size}", method = RequestMethod.GET)
public final List<T> find(@PathVariable final int page, @PathVariable final int size) {
return getRepo().findAll(PageRequest.of(page, size)).getContent();
}
@RequestMapping(value = "/identifiers/{page}/{size}", method = RequestMethod.GET)
public final List<String> findIdentifiers(@PathVariable final int page, @PathVariable final int size) {
return find(page, size).stream().map(MainEntity::getId).collect(Collectors.toList());
}
@RequestMapping(value = "/get", method = RequestMethod.GET)
public final T get(@RequestParam final String id) {
return getRepo().findById(id).orElse(null);
}
@RequestMapping(value = "/count", method = RequestMethod.GET)
public final long count() {
return getRepo().count();
}
}

View File

@ -0,0 +1,40 @@
package eu.data4impact.controller;
import java.util.Arrays;
import java.util.List;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.cache.annotation.CacheEvict;
import org.springframework.cache.annotation.Cacheable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import eu.data4impact.utils.Counter;
import eu.data4impact.utils.DatabaseUtils;
@RestController
@RequestMapping("/admin")
public class AdminController {
@Autowired
private DatabaseUtils databaseUtils;
@RequestMapping(value = "/materializedViews", method = RequestMethod.GET)
public List<String> materializedViews(@RequestParam(required = false, defaultValue = "false") final boolean refresh) {
return refresh ? databaseUtils.refreshMaterializedViews() : databaseUtils.materializedViews();
}
@Cacheable(value = "simpleCache", key = "'tables'")
@RequestMapping(value = "/tables", method = RequestMethod.GET)
public List<Counter> tables() {
return databaseUtils.tableSizes();
}
@CacheEvict(cacheNames = { "simpleCache" }, allEntries = true)
@RequestMapping(value = "/clearCaches", method = RequestMethod.GET)
public List<String> clearCaches() {
return Arrays.asList("Done.");
}
}
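Assuming the service runs on Spring Boot's default embedded server (localhost:8080 is an assumption, nothing in this commit configures the port), the admin endpoints can be exercised as follows:

curl 'http://localhost:8080/admin/tables'
curl 'http://localhost:8080/admin/materializedViews?refresh=true'
curl 'http://localhost:8080/admin/clearCaches'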

View File

@ -0,0 +1,51 @@
package eu.data4impact.controller;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import eu.data4impact.model.documents.DocFulltext;
import eu.data4impact.model.documents.Document;
import eu.data4impact.repository.DocFulltextRepository;
import eu.data4impact.repository.DocumentRepository;
@RestController
@RequestMapping("/api/docs")
public class DocumentController extends AbstractJpaController<Document> {
@Autowired
private DocumentRepository documentRepository;
@Autowired
private DocFulltextRepository docFulltextRepository;
@Override
public JpaRepository<Document, String> getRepo() {
return documentRepository;
}
@RequestMapping(value = "/fulltext", method = RequestMethod.GET, produces = "text/plain")
public String fulltext(@RequestParam final String id) {
return docFulltextRepository.findById(id).map(DocFulltext::getFulltext).orElse("");
}
@RequestMapping(value = "/byType/{type}/{page}/{size}", method = RequestMethod.GET)
public List<Document> findByType(@PathVariable final String type, @PathVariable final int page, @PathVariable final int size) {
return documentRepository.findByType(type, PageRequest.of(page, size)).getContent();
}
@RequestMapping(value = "/types", method = RequestMethod.GET)
public Map<String, Long> types() {
return documentRepository.types().stream().collect(Collectors.toMap(s -> s, s -> documentRepository.countByType(s)));
}
}
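A usage sketch for the document endpoints, including the ones inherited from AbstractJpaController; host/port and the 'publication' type value are assumptions:

curl 'http://localhost:8080/api/docs/count'
curl 'http://localhost:8080/api/docs/list/0/10'
curl 'http://localhost:8080/api/docs/get?id=SOME_DOC_ID'
curl 'http://localhost:8080/api/docs/types'
curl 'http://localhost:8080/api/docs/byType/publication/0/10'
curl 'http://localhost:8080/api/docs/fulltext?id=SOME_DOC_ID'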

View File

@ -0,0 +1,23 @@
package eu.data4impact.controller;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import eu.data4impact.model.journals.Journal;
import eu.data4impact.repository.JournalRepository;
@RestController
@RequestMapping("/api/journals")
public class JournalController extends AbstractJpaController<Journal> {
@Autowired
private JournalRepository journalRepository;
@Override
public JpaRepository<Journal, String> getRepo() {
return journalRepository;
}
}

View File

@ -0,0 +1,43 @@
package eu.data4impact.controller;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RestController;
import eu.data4impact.repository.readonly.OrganizationViewRepository;
import eu.data4impact.utils.ReadOnlyRepository;
import eu.data4impact.views.OrganizationView;
@RestController
@RequestMapping("/api/organizations")
public class OrganizationController extends AbstractReadOnlyController<OrganizationView> {
@Autowired
private OrganizationViewRepository organizationViewRepository;
@Override
public ReadOnlyRepository<OrganizationView, String> getRepo() {
return organizationViewRepository;
}
@RequestMapping(value = "/companies/{page}/{size}", method = RequestMethod.GET)
public List<OrganizationView> findCompanies(@PathVariable final int page, @PathVariable final int size) {
return organizationViewRepository.findByCompany(true, PageRequest.of(page, size)).getContent();
}
@RequestMapping(value = "/summary", method = RequestMethod.GET)
public Map<String, Long> summary() {
final Map<String, Long> res = new LinkedHashMap<>();
res.put("all", organizationViewRepository.count());
res.put("companies", organizationViewRepository.countByCompany(true));
return res;
}
}

View File

@ -0,0 +1,76 @@
package eu.data4impact.controller;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.io.IOUtils;
import org.postgresql.util.Base64;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import eu.data4impact.model.projects.ProjectPortfolio;
import eu.data4impact.repository.ProjectPortfolioRepository;
import eu.data4impact.repository.readonly.ProjectViewRepository;
import eu.data4impact.utils.ReadOnlyRepository;
import eu.data4impact.views.ProjectView;
@RestController
@RequestMapping("/api/projects")
public class ProjectController extends AbstractReadOnlyController<ProjectView> {
@Autowired
private ProjectViewRepository projectRepository;
@Autowired
private ProjectPortfolioRepository projectPortfolioRepository;
@Override
public ReadOnlyRepository<ProjectView, String> getRepo() {
return projectRepository;
}
@RequestMapping(value = "/byFunder/{funder}/{page}/{size}", method = RequestMethod.GET)
public List<ProjectView> findByFunder(@PathVariable final String funder, @PathVariable final int page, @PathVariable final int size) {
return projectRepository.findByFunder(funder, PageRequest.of(page, size)).getContent();
}
@RequestMapping(value = "/funders", method = RequestMethod.GET)
public Map<String, Long> funders() {
return projectRepository.funders().stream().collect(Collectors.toMap(s -> s, s -> projectRepository.countByFunder(s)));
}
@RequestMapping(value = "/portfolio", method = RequestMethod.GET)
public final void getPortfolio(@RequestParam final String id, final HttpServletResponse res) throws IOException {
res.setContentType("application/json");
IOUtils.write(projectPortfolioRepository.findById(id)
.map(ProjectPortfolio::getPortfolio)
.map(Base64::decode)
.map(this::gunzip)
.orElse("{}"),
res.getOutputStream(),
Charset.defaultCharset());
}
private String gunzip(final byte[] bytes) {
try (final ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
final GZIPInputStream gis = new GZIPInputStream(bis)) {
return IOUtils.toString(gis, Charset.defaultCharset());
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
}
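Since the stored portfolio is gzipped JSON encoded in Base64, it can also be inspected outside the application with an equivalent shell pipeline; the table and column names below are assumptions derived from the ProjectPortfolio entity:

psql data4impact -Atc "SELECT portfolio FROM project_portfolio WHERE id = 'SOME_PROJECT_ID'" | base64 -d | gunzip | jq .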

View File

@ -0,0 +1,22 @@
package eu.data4impact.controller;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import eu.data4impact.model.topics.Topic;
import eu.data4impact.repository.TopicRepository;
@RestController
@RequestMapping("/api/topics")
public class TopicController extends AbstractJpaController<Topic> {
@Autowired
private TopicRepository topicRepository;
@Override
public JpaRepository<Topic, String> getRepo() {
return topicRepository;
}
}

View File

@ -0,0 +1,14 @@
spring.main.banner-mode = off
logging.level.root = INFO
spring.datasource.url=jdbc:postgresql://localhost:5432/data4impact
spring.datasource.username=
spring.datasource.password=
spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect
# Hibernate ddl auto (create, create-drop, validate, update)
spring.jpa.hibernate.ddl-auto = validate
spring.jpa.properties.hibernate.hbm2dll.extra_physical_table_types = MATERIALIZED VIEW
spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation=true
spring.jpa.open-in-view=true
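A sketch of the matching local database setup, assuming a PostgreSQL instance on localhost:5432 as configured above (credentials left empty here as in the file):

createdb -h localhost -p 5432 data4impact
psql -h localhost -p 5432 data4impact -c 'SELECT 1;'   # connectivity check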

View File

@ -0,0 +1,14 @@
spring.main.banner-mode = off
logging.level.root = INFO
spring.datasource.url=jdbc:postgresql://localhost:5432/data4impact
spring.datasource.username=
spring.datasource.password=
spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect
# Hibernate ddl auto (create, create-drop, validate, update)
spring.jpa.hibernate.ddl-auto = validate
spring.jpa.properties.hibernate.hbm2dll.extra_physical_table_types = MATERIALIZED VIEW
spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation=true
spring.jpa.open-in-view=true

View File

@ -0,0 +1,5 @@
eu/data4impact/controller/AbstractJpaController.class
eu/data4impact/controller/JournalController.class
eu/data4impact/controller/OrganizationController.class
eu/data4impact/controller/AbstractReadOnlyController.class
eu/data4impact/controller/AdminController.class

View File

@ -0,0 +1,10 @@
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/OrganizationController.java
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/AbstractReadOnlyController.java
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/AdminController.java
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/JournalController.java
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/MainApplication.java
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/TopicController.java
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/ProjectController.java
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/AbstractJpaController.java
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/SwaggerController.java
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/DocumentController.java

View File

@ -0,0 +1,3 @@
UPDATE public.Document
SET batchid = SUBSTR(pubyear, 0, 5); -- keeps the first four characters of pubyear, i.e. the publication year

View File

@ -0,0 +1,58 @@
-- Population of the doc_project relation using the data from project -> project_doc_other_id -> doc_other_identifier -> document
insert into
doc_project(projectid, docid, inferred)
select
p.projectid,
d.docid,
true as inferred
from
project_doc_other_id p
left outer join doc_other_identifier d on (p.docid = d.id and p.docidtype = d.idtype)
where
d.docid is not null
on conflict do nothing;
-- Population of the doc_doc relation using the data from document (e.g. guidelines) -> doc_doc_other_id -> doc_other_identifier -> document (e.g. publication)
insert into
doc_doc(docid1, docid2, reltype, inferred)
select
d.docid1 as docid1,
i.docid as docid2,
d.reltype as reltype,
true as inferred
from
doc_doc_other_id d
left outer join doc_other_identifier i on (d.docid2 = i.id and d.docid2type = i.idtype)
where
i.docid is not null
on conflict do nothing;
-- Remove redundant doc_project relations (references to MOCK PROJECTS would be counted twice, otherwise)
create table temp_delete_doc_project as select t.docid||'@'||t.projectid as item from (
select
dp.docid,
unnest(array_agg(dp.projectid)) as projectid
from
doc_project dp
left outer join project p on (dp.projectid = p.id)
group by
dp.docid, p.funder
having
array_to_string(array_agg(dp.projectid), ',', '') like '%MOCK_PROJECT%'
and array_length(array_agg(DISTINCT dp.projectid), 1) > 1
) as t
where t.projectid like '40|MOCK_PROJECT::%';
create index temp_delete_doc_project_item_idx on temp_delete_doc_project(item);
delete from doc_project where docid||'@'||projectid in (
select item
from temp_delete_doc_project
);
drop table temp_delete_doc_project;
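A quick sanity check after the cleanup, run through psql like the other scripts in this commit (illustrative query, database name assumed):

psql data4impact -c "SELECT count(*) FROM doc_project WHERE projectid LIKE '40|MOCK_PROJECT::%';"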

View File

@ -0,0 +1,29 @@
#!/bin/bash
java -jar /Users/michele/.m2/repository/eu/dnetlib/data4impact-importer/1.1.0-SNAPSHOT/data4impact-importer-1.1.0-SNAPSHOT.jar \
./jsonfiles/swedishProjects/project.json \
./jsonfiles/swedishProjects/projectOtherId.json \
./jsonfiles/swedishProjects/organization.json \
./jsonfiles/swedishProjects/projectOrganization.json \
./jsonfiles/swedishProjects/docotherid.json \
./jsonfiles/swedishProjects/projectdocotherid.json \
./jsonfiles/ecProjectsOpenaire/project.json \
./jsonfiles/ecProjectsOpenaire/projectOtherId.json \
./jsonfiles/ecProjectsOpenaire/organization.json \
./jsonfiles/ecProjectsOpenaire/organizationOtherId.json \
./jsonfiles/ecProjectsOpenaire/projectOrganization.json \
./jsonfiles/cordis/project.json \
./jsonfiles/cordis/projectOtherId.json \
./jsonfiles/cordis/organization.json \
./jsonfiles/cordis/organizationOtherId.json \
./jsonfiles/cordis/projectOrganization.json \
./jsonfiles/companydata/orgCompanyMetrics.json \
./jsonfiles/funderdata/project.json \
./jsonfiles/funderdata/docotherid.json \
./jsonfiles/funderdata/projectdocotherid.json \
./jsonfiles/patents/document.json \
./jsonfiles/patents/doc_fulltext.json \
./jsonfiles/patents/doc_other_identifier.json \
./jsonfiles/guidelines/document.json \
./jsonfiles/guidelines/docotherid.json

View File

@ -0,0 +1,18 @@
# MANUAL STEPS FOR clinical trials
1) cd /data/ftp/d4i/clinical_trials
2) Recreate the table in the DB using:
DROP TABLE clinical_trials;
CREATE TABLE clinical_trials (
doi text,
trial_number text,
trial_registry text
);
3) Insert data:
COPY clinical_trials(doi,trial_number,trial_registry) FROM '/data/ftp/d4i/clinical_trials/clintrial.txt' DELIMITER E'\t';
DELETE FROM clinical_trials where doi = 'pub-with-clin-trial';

View File

@ -0,0 +1,37 @@
#!/bin/bash
detailsFile=../../orig/CompanyData/D4I_companies_summary.txt
workdir=/tmp/companydata
rm -rf "$workdir" && mkdir "$workdir"
echo
echo "CompanyData Import:"
#--------------------------------
echo " - Generating csv files"
csvDetails="$workdir/details.csv"
cat $detailsFile | jq 'to_entries' | jq 'map([.key, .value."data gathered?", .value."tangible + pre_market", .value."tangible + market", .value."intangible + pre_market", .value."intangible + market", .value."innovation?"])' | jq .[] | jq -r @csv > $csvDetails
#--------------------------------
echo " - Recreating the companydata database"
dropdb companydata --if-exists;
createdb companydata;
psql companydata -f schema.sql
if [[ -f "$csvDetails" ]]; then
echo " - Importing details: $csvDetails"
psql companydata -c "COPY companymetrics(orgid, data_gathered, tangible_pre_market, tangible_market, intangible_pre_market, intangible_market, innovation) FROM '$csvDetails' CSV;"
else
echo " - Invalid file: $csvDetails"
fi
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/companydata/*.json
psql companydata -f metrics2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/companydata/orgCompanyMetrics.json
echo "Done."
echo

View File

@ -0,0 +1,40 @@
#!/bin/bash
detailsFile=../../orig/CompanyData/D4I_company_innovation_texts.json
workdir=/tmp/companydata_texts
rm -rf "$workdir" && mkdir "$workdir"
echo
echo "CompanyData Innovation texts Import:"
#--------------------------------
echo " - Generating csv files"
csvDetails="$workdir/details.csv"
cat $detailsFile | jq --slurp -r '(map(keys) | add | unique) as $cols | map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv' > $csvDetails
#--------------------------------
echo " - Recreating the companydata_texts database"
dropdb companydata_texts --if-exists;
createdb companydata_texts;
psql companydata_texts -f schema_texts.sql
if [[ -f "$csvDetails" ]]; then
echo " - Importing details: $csvDetails"
psql companydata_texts -c "COPY data(company_id,prediction_revised,site_url,source,text_clean_gentle,text_clean_strong,text_is_duplicated) FROM '$csvDetails' CSV HEADER;"
else
echo " - Invalid file: $csvDetails"
fi
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/companydata_texts/*.json
psql companydata_texts -f innovationTexts2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/companydata_texts/orgCompanyInnovationTexts.json
echo "Done."
echo

View File

@ -0,0 +1,11 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'20|ec__________::'||MD5(company_id) AS "orgId",
prediction_revised AS "predictionRevised",
site_url AS "siteUrl",
source AS "source",
text_clean_gentle AS "textCleanGentle",
text_clean_strong AS "textCleanStrong",
text_is_duplicated AS "duplicated"
FROM data
) t) TO STDOUT;

View File

@ -0,0 +1,15 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'20|ec__________::'||MD5(orgid) AS "orgId",
(LOWER(data_gathered)='yes') AS "dataGathered",
tangible_pre_market AS "tangiblePreMarket",
tangible_market AS "tangibleMarket",
intangible_pre_market AS "intangiblePreMarket",
intangible_market AS "intangibleMarket",
CASE
WHEN innovation='0' THEN false
WHEN innovation='1' THEN true
ELSE NULL
END AS "innovation"
FROM companymetrics
) t) TO STDOUT;

View File

@ -0,0 +1,11 @@
CREATE TABLE companymetrics (
orgid text,
data_gathered varchar(5),
tangible_pre_market int,
tangible_market int,
intangible_pre_market int,
intangible_market int,
innovation varchar(5)
);

View File

@ -0,0 +1,10 @@
CREATE TABLE data (
company_id text,
prediction_revised float,
site_url text,
source text,
text_clean_gentle text,
text_clean_strong text,
text_is_duplicated boolean
);

View File

@ -0,0 +1,21 @@
Paolo, Vilius, all,
Last week I was in London attending a "special" event for publishers, and I had the opportunity to meet a guy from the Strategic Initiatives dept. of Crossref, who pointed me to the Event Data API
(https://www.crossref.org/services/event-data/). The API links publications to several external sources, including Patents, Twitter, Wikipedia, Reddit, StackExchange, Wordpress, etc.
Running some queries on their DB, we saw that for Twitter they do have data for more than a year.
For patents they rely on Cambia's Lens (https://www.lens.org/), and they do have links from patents to pubs. Unfortunately, for some reason, Cambia uploaded data only once and then stopped. They will talk to them to see what has happened.
In any case, I think this API is very useful both for D4I and OA, and we should have a look and possibly integrate such data as soon as possible.
All the best,
Omiros
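For a first look at the data, the Event Data API mentioned above can be queried directly; an illustrative sketch, with query parameters as per the public Crossref Event Data documentation and a hypothetical mailto address:

curl 'https://api.eventdata.crossref.org/v1/events?mailto=someone@example.org&source=twitter&rows=10'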

View File

@ -0,0 +1,32 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
pmcid AS "id",
'pmcid' AS "type"
FROM data
WHERE pmcid IS NOT NULL AND pmcid != '' AND pmcid not ilike 'none'
UNION ALL
SELECT
pmid AS "id",
'pmid' AS "type"
FROM data
WHERE pmid IS NOT NULL AND pmid != '' AND pmid not ilike 'none'
UNION ALL
SELECT
doi AS "id",
'doi' AS "type"
FROM data
WHERE doi IS NOT NULL AND doi != '' AND doi not ilike 'none'
UNION ALL
SELECT
d_b_id AS "id",
'drug_bank_id' AS "type"
FROM data
WHERE d_b_id IS NOT NULL AND d_b_id != '' AND d_b_id not ilike 'none'
) t) TO STDOUT;

View File

@ -0,0 +1,36 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
pmcid AS "docId",
'pmcid' AS "docIdType"
FROM data
WHERE pmcid IS NOT NULL AND pmcid != '' AND pmcid not ilike 'none'
UNION ALL
SELECT
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
pmid AS "docId",
'pmid' AS "docIdType"
FROM data
WHERE pmid IS NOT NULL AND pmid != '' AND pmid not ilike 'none'
UNION ALL
SELECT
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
doi AS "docId",
'doi' AS "docIdType"
FROM data
WHERE doi IS NOT NULL AND doi != '' AND doi not ilike 'none'
UNION ALL
SELECT
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
d_b_id AS "docId",
'drug_bank_id' AS "docIdType"
FROM data
WHERE d_b_id IS NOT NULL AND d_b_id != '' AND d_b_id not ilike 'none'
) t) TO STDOUT;

View File

@ -0,0 +1,6 @@
COPY (SELECT row_to_json(t) FROM (SELECT distinct
'40|MOCK_PROJECT::'||MD5(funder) AS "id",
'MOCK PROJECT' AS "title",
funder AS "funder"
FROM data
) t) TO STDOUT;

View File

@ -0,0 +1,10 @@
CREATE TABLE data (
id text,
d_b_id text,
doi text,
pmcid text,
pmid text,
drug_substance text,
funder text,
section_of_drug_bank_entry_where_citation_occured text
);

View File

@ -0,0 +1,52 @@
#!/bin/bash
excelFile="../../orig/drug_bank_database/Publication_citations_in_Drug_Bank_database.xlsx"
workdir=/tmp/drugbank
rm -rf "$workdir" && mkdir "$workdir"
echo
echo "Links from drugbank db Import:"
#--------------------------------
echo " - Generating csv file"
csv="$workdir/drugbank.csv"
xlsx2csv -c UTF-8 "$excelFile" > $csv
#--------------------------------
echo " - Recreating the drugbank database"
dbname=drugbank
dropdb $dbname --if-exists;
createdb $dbname;
psql $dbname -f schema.sql
if [[ -f "$csv" ]]; then
echo " - Importing data: $csv"
psql $dbname -c "COPY data(id,d_b_id,doi,pmcid,pmid,drug_substance,funder,section_of_drug_bank_entry_where_citation_occured) FROM '$csv' CSV HEADER;"
else
echo " - Invalid file: $csv"
fi
echo " - Fix funder names"
psql $dbname -c "UPDATE data SET funder='EC' WHERE funder = 'European Research Council'"
psql $dbname -c "UPDATE data SET funder='Austrian Science Fund FWF' WHERE funder = 'FWF'"
psql $dbname -c "UPDATE data SET funder='Swiss National Science Foundation SNSF' WHERE funder = 'Swiss National Science Foundation'"
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/drug_bank_database/*.json
psql $dbname -f projects2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database/project.json
psql $dbname -f docOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database/docotherid.json
psql $dbname -f projDocOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database/projectdocotherid.json
echo "Done."
echo

View File

@ -0,0 +1,32 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
pmcid AS "id",
'pmcid' AS "type"
FROM data
WHERE pmcid IS NOT NULL AND pmcid != '' AND pmcid not ilike 'none'
UNION ALL
SELECT
pmid AS "id",
'pmid' AS "type"
FROM data
WHERE pmid IS NOT NULL AND pmid != '' AND pmid not ilike 'none'
UNION ALL
SELECT
doi AS "id",
'doi' AS "type"
FROM data
WHERE doi IS NOT NULL AND doi != '' AND doi not ilike 'none'
UNION ALL
SELECT
d_b_id AS "id",
'drug_bank_id' AS "type"
FROM data
WHERE d_b_id IS NOT NULL AND d_b_id != '' AND d_b_id not ilike 'none'
) t) TO STDOUT;

View File

@ -0,0 +1,36 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'40|corda_______::'||MD5(ec_project_code) AS "projectId",
pmcid AS "docId",
'pmcid' AS "docIdType"
FROM data
WHERE pmcid IS NOT NULL AND pmcid != '' AND pmcid not ilike 'none' AND ec_project_code not ilike 'unknown' AND funding_scheme ilike 'FP7%'
UNION ALL
SELECT
'40|corda_______::'||MD5(ec_project_code) AS "projectId",
pmid AS "docId",
'pmid' AS "docIdType"
FROM data
WHERE pmid IS NOT NULL AND pmid != '' AND pmid not ilike 'none' AND ec_project_code not ilike 'unknown' AND funding_scheme ilike 'FP7%'
UNION ALL
SELECT
'40|corda_______::'||MD5(ec_project_code) AS "projectId",
doi AS "docId",
'doi' AS "docIdType"
FROM data
WHERE doi IS NOT NULL AND doi != '' AND doi not ilike 'none' AND ec_project_code not ilike 'unknown' AND funding_scheme ilike 'FP7%'
UNION ALL
SELECT
'40|corda_______::'||MD5(ec_project_code) AS "projectId",
d_b_id AS "docId",
'drug_bank_id' AS "docIdType"
FROM data
WHERE d_b_id IS NOT NULL AND d_b_id != '' AND d_b_id not ilike 'none' AND ec_project_code not ilike 'unknown' AND funding_scheme ilike 'FP7%'
) t) TO STDOUT;

View File

@ -0,0 +1,11 @@
CREATE TABLE data (
doi text,
d_b_id text,
pmcid text,
pmid text,
drug_substance text,
ec_project_acronym text,
ec_project_code text,
funding_scheme text,
match_type text
);

View File

@ -0,0 +1,39 @@
#!/bin/bash
excelFile="../../orig/drug_bank_database/DB_Publication_project_links.xlsx"
workdir=/tmp/drugbank_part2
rm -rf "$workdir" && mkdir "$workdir"
echo
echo "Links from drugbank db Import:"
#--------------------------------
echo " - Generating csv file"
csv="$workdir/drugbank.csv"
xlsx2csv -c UTF-8 "$excelFile" > $csv
#--------------------------------
echo " - Recreating the drugbank database"
dbname=drugbank_p2
dropdb $dbname --if-exists;
createdb $dbname;
psql $dbname -f schema.sql
if [[ -f "$csv" ]]; then
echo " - Importing data: $csv"
psql $dbname -c "COPY data(doi,d_b_id,pmcid,pmid,drug_substance,ec_project_acronym,ec_project_code,funding_scheme,match_type) FROM '$csv' CSV HEADER;"
else
echo " - Invalid file: $csv"
fi
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/drug_bank_database_part2/*.json
psql $dbname -f docOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database_part2/docotherid.json
psql $dbname -f projDocOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database_part2/projectdocotherid.json
echo "Done."
echo

View File

@ -0,0 +1,56 @@
#!/bin/bash
urlOrgFp7=http://cordis.europa.eu/data/cordis-fp7organizations.xlsx
urlOrgH2020=http://cordis.europa.eu/data/cordis-h2020organizations.xlsx
workdir=/tmp/cordis
rm -rf "$workdir" && mkdir "$workdir"
echo
echo "cordis Import:"
#--------------------------------
echo " - Downloading files"
wget "$urlOrgFp7" -O "$workdir/fp7orgs.xlsx" -q --show-progress
wget "$urlOrgH2020" -O "$workdir/h2020orgs.xlsx" -q --show-progress
#--------------------------------
echo " - Generating csv files"
csvfp7="$workdir/fp7orgs.csv"
csvh2020="$workdir/h2020orgs.csv"
xlsx2csv -c UTF-8 "$workdir/fp7orgs.xlsx" > $csvfp7
xlsx2csv -c UTF-8 "$workdir/h2020orgs.xlsx" > $csvh2020
#--------------------------------
echo " - Recreating the cordis database"
dropdb cordis --if-exists;
createdb cordis;
psql cordis -f schema.sql
if [[ -f "$csvfp7" ]]; then
echo " - Importing FP7 participants: $csvfp7"
psql cordis -c "COPY participants(projectrcn,projectiD,projectacronym,role,orgid,orgname,orgshortname,activitytype,endofparticipation,eccontribution,country,street,city,postCode,organizationurl,vatnumber,contactform,contacttype,contacttitle,contactfirstnames,contactlastnames,contactfunction,contacttelephonenumber,contactfaxnumber) FROM '$csvfp7' CSV HEADER;"
psql cordis -c "UPDATE participants SET fundingprogram='FP7' WHERE fundingprogram IS NULL"
else
echo " - Invalid file fp7: $csvfp7"
fi
if [[ -f "$csvh2020" ]]; then
echo " - Importing H2020 participants: $csvh2020"
psql cordis -c "COPY participants(projectrcn,projectiD,projectacronym,role,orgid,orgname,orgshortname,activitytype,endofparticipation,eccontribution,country,street,city,postCode,organizationurl,vatnumber,contactform,contacttype,contacttitle,contactfirstnames,contactlastnames,contactfunction,contacttelephonenumber,contactfaxnumber) FROM '$csvh2020' CSV HEADER;"
psql cordis -c "UPDATE participants SET fundingprogram='H2020' WHERE fundingprogram IS NULL"
else
echo " - Invalid file h2020: $csvh2020"
fi
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/cordis/*.json
psql cordis -f projects2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/project.json
psql cordis -f orgs2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/organization.json
psql cordis -f projOrg2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/projectOrganization.json
psql cordis -f projOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/projectOtherId.json
psql cordis -f orgOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/organizationOtherId.json
echo "Done."
echo

View File

@ -0,0 +1,9 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'20|ec__________::'||MD5(orgid) AS "orgId",
orgid AS "id",
'ec:PIC' AS "type"
FROM participants
WHERE orgid IS NOT NULL
GROUP BY orgid
) t) TO STDOUT;

View File

@ -0,0 +1,13 @@
COPY (SELECT row_to_json(t) FROM (SELECT
'20|ec__________::'||MD5(orgid) AS "id",
MAX(orgname) AS "name",
MAX(orgshortname) AS "shortName",
MAX(country) AS "country",
MAX(street) AS "street",
MAX(city) AS "city",
MAX(postcode) AS "postCode",
MAX(organizationurl) AS "url"
FROM participants
WHERE orgid IS NOT NULL
GROUP BY orgid
) t) TO STDOUT;

View File

@ -0,0 +1,26 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
CASE
WHEN fundingprogram='FP7' THEN '40|corda_______::'||MD5(projectid)
WHEN fundingprogram='H2020' THEN '40|corda__h2020::'||MD5(projectid)
ELSE '40|unknown_____::'||MD5(projectid)
END AS "projectId",
'20|ec__________::'||MD5(orgid) AS "orgId",
MAX(role) AS "role",
MAX(activitytype) AS "activityType",
MAX(endofparticipation) AS "endOfParticipation",
MAX(eccontribution) AS "ecContribution",
MAX(contacttype) AS "contactType",
MAX(contacttitle) AS "contactTitle",
MAX(contactfirstnames) AS "contactFirstNames",
MAX(contactlastnames) AS "contactLastNames",
MAX(contactfunction) AS "contactFunction",
MAX(contacttelephonenumber) AS "contactTelephoneNumber",
MAX(contactfaxnumber) AS "contactFaxNumber",
MAX(contactform) AS "contactForm"
FROM participants
WHERE orgid IS NOT NULL AND projectid IS NOT NULL
GROUP BY orgid, projectid, fundingprogram
) t) TO STDOUT;

View File

@ -0,0 +1,27 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
CASE
WHEN fundingprogram='FP7' THEN '40|corda_______::'||MD5(projectid)
WHEN fundingprogram='H2020' THEN '40|corda__h2020::'||MD5(projectid)
ELSE '40|unknown_____::'||MD5(projectid)
END AS "projectId",
projectid AS "id",
'ec:grant_id' AS "type"
FROM participants
WHERE projectid IS NOT NULL
GROUP BY projectid, fundingprogram
UNION ALL
SELECT
CASE
WHEN fundingprogram='FP7' THEN '40|corda_______::'||MD5(projectid)
WHEN fundingprogram='H2020' THEN '40|corda__h2020::'||MD5(projectid)
ELSE '40|unknown_____::'||MD5(projectid)
END AS "projectId",
MAX(projectrcn) AS "id",
'ec:RCN' AS "type"
FROM participants
WHERE projectid IS NOT NULL AND projectrcn IS NOT NULL
GROUP BY projectid, fundingprogram
) t) TO STDOUT;

View File

@ -0,0 +1,16 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
CASE
WHEN fundingprogram='FP7' THEN '40|corda_______::'||MD5(projectid)
WHEN fundingprogram='H2020' THEN '40|corda__h2020::'||MD5(projectid)
ELSE '40|unknown_____::'||MD5(projectid)
END AS "id",
MAX(projectacronym) AS "acronym",
'EC' AS "funder",
fundingprogram AS "fundingLevel0"
FROM participants
WHERE projectid IS NOT NULL
GROUP BY
projectid,
fundingprogram
) t) TO STDOUT;

View File

@ -0,0 +1,28 @@
CREATE TABLE participants (
projectrcn text,
projectid text,
projectacronym text,
role text,
orgid text,
orgname text,
orgshortname text,
activitytype text,
endofparticipation text,
eccontribution text,
country text,
street text,
city text,
postcode text,
organizationurl text,
vatnumber text,
contacttype text,
contacttitle text,
contactfirstnames text,
contactlastnames text,
contactfunction text,
contacttelephonenumber text,
contactfaxnumber text,
contactform text,
fundingprogram varchar(10)
);

View File

@ -0,0 +1 @@
The script should be launched on the OpenAIRE production server (services.openaire.eu).

View File

@ -0,0 +1,16 @@
#!/bin/bash
BASEDIR=/tmp/ecProjectsOpenaire
echo "Saving files in $BASEDIR ..."
rm -rf $BASEDIR
mkdir $BASEDIR
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f projects2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/project.json
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f orgs2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/organization.json
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f projOrg2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/projectOrganization.json
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f orgOtherId2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/organizationOtherId.json
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f projOtherId2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/projectOtherId.json
echo Done.

View File

@ -0,0 +1,8 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'20|ec__________::'||MD5(substring(id from 15)) AS "orgId",
substring(id from 15) AS "id",
'ec:PIC' AS "type"
FROM dsm_organizations
WHERE id LIKE 'corda%'
) t) TO STDOUT;

View File

@ -0,0 +1,28 @@
COPY (SELECT row_to_json(t) FROM (SELECT
'20|ec__________::'||MD5(substring(o.id from 15)) AS "id",
o.legalname AS "name",
o.legalshortname AS "shortName",
o.country AS "country",
o.websiteurl AS "url",
o.ec_legalbody AS "ecLegalBody",
o.ec_legalperson AS "ecLegalPerson",
o.ec_nonprofit AS "ecNonProfit",
o.ec_researchorganization AS "ecResearchOrganization",
o.ec_highereducation AS "ecHigherEducation",
o.ec_internationalorganizationeurinterests AS "ecInternationalOrganizationEurInterests",
o.ec_internationalorganization AS "ecInternationalOrganization",
o.ec_enterprise AS "ecEnterprise",
o.ec_smevalidated AS "ecSmeValidated",
o.ec_nutscode AS "ecNutsCode"
FROM
dsm_organizations o
LEFT OUTER JOIN project_organization po ON (po.resporganization = o.id)
WHERE
o.id LIKE 'corda%'
) t) TO STDOUT;

View File

@ -0,0 +1,13 @@
COPY (SELECT row_to_json(t) FROM (SELECT
'40|'||substring(project from 1 for 12)||'::'||MD5(substring(project from 15)) AS "projectId",
'20|ec__________::'||MD5(substring(resporganization from 15)) AS "orgId",
semanticclass AS "role"
FROM
project_organization
WHERE
project LIKE 'corda%'
) t) TO STDOUT;

View File

@ -0,0 +1,8 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'40|'||substring(id from 1 for 12)||'::'||MD5(substring(id from 15)) AS "projectId",
code AS "id",
'ec:grant_id' AS "type"
FROM projects
WHERE id LIKE 'corda%'
) t) TO STDOUT;

View File

@ -0,0 +1,23 @@
COPY (SELECT row_to_json(t) FROM (SELECT
'40|'||substring(p.id from 1 for 12)||'::'||MD5(substring(p.id from 15)) AS "id",
p.title AS "title",
p.acronym AS "acronym",
p.call_identifier AS "callId",
split_part(pf.funding, '::', 2) AS "funder",
split_part(pf.funding, '::', 3) AS "fundingLevel0",
split_part(pf.funding, '::', 4) AS "fundingLevel1",
split_part(pf.funding, '::', 5) AS "fundingLevel2",
p.startdate AS "startDate",
p.enddate AS "endDate",
p.websiteurl AS "websiteUrl",
p.keywords AS "keywords",
p.contracttypescheme||':'||p.contracttypeclass AS "contractType",
p.ec_sc39 AS "ecSc39",
p.oa_mandate_for_publications AS "oaMandateForPublications",
p.ec_article29_3 AS "ecArticle29_3"
FROM
projects p
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
WHERE
p.id LIKE 'corda%'
) t) TO STDOUT;

View File

@ -0,0 +1,24 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
pmcid AS "id",
'pmcid' AS "type"
FROM data
WHERE pmcid IS NOT NULL AND pmcid != ''
UNION ALL
SELECT
pmid AS "id",
'pmid' AS "type"
FROM data
WHERE pmid IS NOT NULL AND pmid != ''
UNION ALL
SELECT
doi AS "id",
'doi' AS "type"
FROM data
WHERE doi IS NOT NULL AND doi != ''
) t) TO STDOUT;

View File

@ -0,0 +1,51 @@
#!/bin/bash
excelFile="../../orig/fundersData/Funders, DOIS 31122018.xlsx"
workdir=/tmp/funderData
rm -rf "$workdir" && mkdir "$workdir"
echo
echo "Funder Data Import:"
#--------------------------------
echo " - Generating csv file"
csv="$workdir/funderdata.csv"
xlsx2csv -c UTF-8 "$excelFile" > $csv
#--------------------------------
echo " - Recreating the funderdata database"
dropdb funderdata --if-exists;
createdb funderdata;
psql funderdata -f schema.sql
if [[ -f "$csv" ]]; then
echo " - Importing data: $csv"
psql funderdata -c "COPY data(funder,pmcid,pmid,source,doi) FROM '$csv' CSV HEADER;"
else
echo " - Invalid file: $csv"
fi
echo " - Fix funder names"
psql funderdata -c "UPDATE data SET funder='EC' WHERE funder = 'Marie Curie'"
psql funderdata -c "UPDATE data SET funder='EC' WHERE funder = 'European Research Council'"
psql funderdata -c "UPDATE data SET funder='Breast Cancer Now' WHERE funder = 'BreastCancerNow'"
psql funderdata -c "UPDATE data SET funder='Wellcome Trust' WHERE funder = 'Wellcome Trust/DBT India Alliance'"
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/funderdata/*.json
psql funderdata -f projects2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/funderdata/project.json
psql funderdata -f docOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/funderdata/docotherid.json
psql funderdata -f projDocOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/funderdata/projectdocotherid.json
echo "Done."
echo

View File

@ -0,0 +1,26 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
pmcid AS "docId",
'pmcid' AS "docIdType"
FROM data
WHERE pmcid IS NOT NULL AND pmcid != ''
UNION ALL
SELECT
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
pmid AS "docId",
'pmid' AS "docIdType"
FROM data
WHERE pmid IS NOT NULL AND pmid != ''
UNION ALL
SELECT
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
doi AS "docId",
'doi' AS "docIdType"
FROM data
WHERE doi IS NOT NULL AND doi != ''
) t) TO STDOUT;

View File

@ -0,0 +1,6 @@
COPY (SELECT row_to_json(t) FROM (SELECT distinct
'40|MOCK_PROJECT::'||MD5(funder) AS "id",
'MOCK PROJECT' AS "title",
funder AS "funder"
FROM data
) t) TO STDOUT;

View File

@ -0,0 +1,8 @@
CREATE TABLE data (
funder text,
pmcid text,
pmid text,
source text,
doi text
);

View File

@ -0,0 +1,21 @@
Dear Claudio, please find attached the JSON file containing the clinical guideline base data.
It is formatted as follows:
LocalID [Our local guideline ID]
Type "guideline"
Title [Title of guideline]
PubYear [Guideline publication year]
Originator [Organization that created the guideline (subset of ProviderCollection)]
ProviderCollection [Collection name]
Abstract [Guideline abstract (from PubMed, if available (only from WHO, NICE and Cochrane))]
PMID [PMID if available]
DOI [DOI if available]
PMCID [PMCID if available]
MatchedReferences: [References matched with our set of publications, as PMID (as well as PMCID and funder name)]
[All]References: [All references in each guideline]
We also have the full text for WHO, NICE and Cochrane, as well as the PDFs for the German AWMF guidelines, but it is still uncertain whether this material can be shared due to copyright issues.
FILE: /data/d4i/guidelines.json.zip
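Based on the field list above, the file should be an array of records shaped roughly like the following (all values hypothetical); piping a sample through jq is a quick way to validate it:

cat <<'EOF' | jq .
{
  "LocalID": "NICE-0001",
  "Type": "guideline",
  "Title": "Example guideline title",
  "PubYear": "2017",
  "Originator": "NICE",
  "ProviderCollection": "NICE",
  "Abstract": "Example abstract.",
  "PMID": "12345678",
  "DOI": "10.1000/example",
  "PMCID": "PMC1234567",
  "MatchedReferences": [ { "PMID": "23456789", "PMCID": "PMC2345678", "DOI": "10.1000/ref", "Funders": ["EC"] } ],
  "AllReferences": ["23456789"]
}
EOF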

View File

@ -0,0 +1,5 @@
At the moment the MatchedReferences are not handled.
The current model provides the Citation table to put two documents in relation.
It may need to be revised.

View File

@ -0,0 +1,39 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'50|guidelines__::'||MD5(gid) AS "docId1",
pmcid AS "docId2",
'pmcid' AS "docId2Type",
'guidelines_matched' AS "relType"
FROM relations
WHERE pmcid IS NOT NULL AND pmcid != ''
UNION
SELECT
'50|guidelines__::'||MD5(gid) AS "docId1",
pmid AS "docId2",
'pmid' AS "docId2Type",
'guidelines_matched' AS "relType"
FROM relations
WHERE pmid IS NOT NULL AND pmid != ''
UNION
SELECT
'50|guidelines__::'||MD5(gid) AS "docId1",
doi AS "docId2",
'doi' AS "docId2Type",
'guidelines_matched' AS "relType"
FROM relations
WHERE doi IS NOT NULL AND doi != ''
UNION
SELECT
'50|guidelines__::'||MD5(gid) AS "docId1",
rel AS "docId2",
'pmid' AS "docId2Type",
'guidelines_all' AS "relType"
FROM allrefs
WHERE rel IS NOT NULL AND rel != ''
) t) TO STDOUT;

View File

@ -0,0 +1,72 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'50|guidelines__::'||MD5(id) AS "docId",
id AS "id",
'guidelineLocalID' AS "type"
FROM guidelines
UNION ALL
SELECT
'50|guidelines__::'||MD5(id) AS "docId",
pmcid AS "id",
'pmcid' AS "type"
FROM guidelines
WHERE pmcid IS NOT NULL AND pmcid != ''
UNION ALL
SELECT
'50|guidelines__::'||MD5(id) AS "docId",
pmid AS "id",
'pmid' AS "type"
FROM guidelines
WHERE pmid IS NOT NULL AND pmid != ''
UNION ALL
SELECT
'50|guidelines__::'||MD5(id) AS "docId",
doi AS "id",
'doi' AS "type"
FROM guidelines
WHERE doi IS NOT NULL AND doi != ''
UNION
SELECT
NULL AS "docId",
pmcid AS "id",
'pmcid' AS "type"
FROM relations
WHERE pmcid IS NOT NULL AND pmcid != ''
UNION
SELECT
NULL AS "docId",
pmid AS "id",
'pmid' AS "type"
FROM relations
WHERE pmid IS NOT NULL AND pmid != ''
UNION
SELECT
NULL AS "docId",
doi AS "id",
'doi' AS "type"
FROM relations
WHERE doi IS NOT NULL AND doi != ''
UNION
SELECT
NULL AS "docId",
rel AS "id",
'pmid' AS "type"
FROM allrefs
WHERE rel IS NOT NULL AND rel != ''
) t) TO STDOUT;

View File

@ -0,0 +1,12 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'50|guidelines__::'||MD5(g.id) AS "id",
g.title AS "title",
g.abstract AS "abstractText",
g.gtype AS "type",
g.year AS "pubYear",
g.orig AS "repository",
g.collection AS "collection"
FROM
guidelines g
) t) TO STDOUT;

View File

@ -0,0 +1,67 @@
#!/bin/bash
#detailsFile=../../orig/guidelines/guidelines.json
detailsFile=/tmp/guidelines.json
workdir=/tmp/guidelines
rm -rf "$workdir" && mkdir "$workdir"
echo
echo "Guidelines Import:"
#--------------------------------
echo " - Generating csv files"
csvGuidelines="$workdir/guidelines.csv"
csvRels="$workdir/rels.csv"
csvAllRels="$workdir/allRels.csv"
cat $detailsFile | jq 'map([.LocalID, .Type, .Title, .PubYear, .Originator, .ProviderCollection, .Abstract, .PMID, .DOI, .PMCID])' | jq .[] | jq -r @csv > $csvGuidelines
cat $detailsFile | jq -r '.[] | .LocalID as $id | (.MatchedReferences | map([$id, (.PMID + ""), (.PMCID + ""), (.DOI + "") , ( .Funders | map(.+"#") | add | . + "" ) ]) )[] | @csv' > $csvRels
cat $detailsFile | jq -r '.[] | .LocalID as $id | (.AllReferences | map([$id, .]) )[] | @csv' > $csvAllRels
#--------------------------------
echo " - Recreating the guidelines database"
dropdb guidelines --if-exists;
createdb guidelines;
psql guidelines -f schema.sql
if [[ -f "$csvGuidelines" ]]; then
echo " - Importing guidelines: $csvGuidelines"
psql guidelines -c "COPY guidelines(id, gtype, title, year, orig, collection, abstract, pmid, doi, pmcid) FROM '$csvGuidelines' CSV;"
else
echo " - Invalid file: $csvGuidelines"
fi
if [[ -f "$csvRels" ]]; then
echo " - Importing rels: $csvRels"
psql guidelines -c "COPY relations(gid, pmid, pmcid, doi, funder) FROM '$csvRels' CSV;"
else
echo " - Invalid file: $csvRels"
fi
if [[ -f "$csvAllRels" ]]; then
echo " - Importing all rels: $csvAllRels"
psql guidelines -c "COPY allrefs(gid, rel) FROM '$csvAllRels' CSV;"
else
echo " - Invalid file: $csvAllRels"
fi
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/guidelines/*.json
psql guidelines -f document2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/document.json
psql guidelines -f docOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/docotherid.json
psql guidelines -f projects2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/project.json
psql guidelines -f projDocOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/projectdocotherid.json
psql guidelines -f docDocumentOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/docDocumentOtherId.json
#--------------------------------
echo " - Importing final files"
cd ../../jsonfiles/guidelines
echo "Done."
echo

View File

@ -0,0 +1,20 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
pmcid AS "docId",
'pmcid' AS "docIdType"
FROM (select * from (select pmid, pmcid, unnest(string_to_array(funder, '#')) as funder from relations) as t where length(t.funder) > 0) r
WHERE pmcid IS NOT NULL AND pmcid != ''
UNION ALL
SELECT
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
pmid AS "docId",
'pmid' AS "docIdType"
FROM (select * from (select pmid, pmcid, unnest(string_to_array(funder, '#')) as funder from relations) as t where length(t.funder) > 0) r
WHERE pmid IS NOT NULL AND pmid != ''
) t) TO STDOUT;

View File

@ -0,0 +1,7 @@
COPY (SELECT row_to_json(t) FROM (SELECT distinct
'40|MOCK_PROJECT::'||MD5(funder) AS "id",
'MOCK PROJECT' AS "title",
funder AS "funder"
FROM
(SELECT DISTINCT unnest(string_to_array(funder, '#')) AS funder FROM relations ) r WHERE LENGTH(r.funder) > 0
) t) TO STDOUT;

View File

@ -0,0 +1,25 @@
CREATE TABLE guidelines (
id text,
gtype text,
title text,
year text,
orig text,
collection text,
abstract text,
pmid text,
doi text,
pmcid text
);
CREATE TABLE relations (
gid text,
pmid text,
pmcid text,
doi text,
funder text
);
CREATE TABLE allrefs (
gid text,
rel text
);

View File

@ -0,0 +1,49 @@
#!/bin/bash
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
table=network_analysis_metrics
echo "Recrreating table $table"
psql -h localhost -U d4i data4impact -c "DROP TABLE IF EXISTS $table;"
psql -h localhost -U d4i data4impact -c "CREATE TABLE $table (betweenness_centrality double precision,closeness_centrality double precision,degree_centrality double precision,eccentricity_centrality double precision,eigenvector_centrality double precision,farness_centrality double precision,pic text,name text, icd int, period text, orgid text);"
echo
for icd in {1..19}
do
if [ -d "$DIR/$icd" ]; then
cd "$DIR/$icd"
for csv in *.csv
do
y1=$(echo $csv | cut -c1-4)
y2=$(expr $y1 + 1)
period="$y1-$y2"
echo "Processing file $DIR/$icd/$csv..."
if grep --quiet eccentricity_centrality "$DIR/$icd/$csv"; then
psql -h localhost -U d4i data4impact -c "COPY $table (betweenness_centrality,closeness_centrality,degree_centrality,eccentricity_centrality,eigenvector_centrality,farness_centrality,pic,name) FROM '$DIR/$icd/$csv' CSV HEADER;"
else
psql -h localhost -U d4i data4impact -c "COPY $table (betweenness_centrality,closeness_centrality,degree_centrality,eigenvector_centrality,farness_centrality,pic,name) FROM '$DIR/$icd/$csv' CSV HEADER;"
fi
psql -h localhost -U d4i data4impact -c "UPDATE $table SET (icd,period) = ($icd,'$period') WHERE icd IS NULL;"
echo;
done
fi
done
echo "Fixing values..."
psql -h localhost -U d4i data4impact -c "UPDATE $table SET pic = replace(pic, '.0', '') WHERE pic IS NOT NULL;"
psql -h localhost -U d4i data4impact -c "UPDATE $table SET orgid = '20|ec__________::'||MD5(pic) WHERE pic IS NOT NULL;"
echo
echo "Done."
echo
echo

View File

@ -0,0 +1,211 @@
# MANUAL STEPS FOR news-blogs-forum
1) cd /data/ftp/d4i/social_data/news-blogs-forum
2) find *.zip -exec bash -c "unzip -p {} | jq --slurp -r 'map([.content,.actor,.topicId,.mediatype,.source,.headline,.url,.dt,.language,.country]) | .[] | @csv'" \; | sed 's/\x00//g' > data4impact_corpus_allmedia.csv
3) Recreate the table in the DB using:
DROP TABLE socialdata;
DROP SEQUENCE socialdata_serial;
CREATE SEQUENCE socialdata_serial START 1;
CREATE TABLE socialdata (
id text PRIMARY KEY DEFAULT '51|social__data::'||MD5(nextval('socialdata_serial')::text),
content text,
actor text,
topicId text,
mediatype text,
source text,
headline text,
url text,
dt text,
language text,
country text
);
4) Insert data:
COPY socialdata(content,actor,topicId,mediatype,source,headline,url,dt,language,country) FROM '/data/ftp/d4i/social_data/news-blogs-forum/data4impact_corpus_allmedia.csv' CSV;
(OPTIONAL) If the COPY fails, strip NUL bytes first: perl -pi -e 's/\x00//g' data4impact_corpus_allmedia.csv
#############################################################################################################################################################
# MANUAL STEPS FOR twitter (Buzz)
1) cd "/data/ftp/d4i/social_data/twitter/Buzz JSON Feb"
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.rtid,.rtDate,.date,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter_buzz.csv
3) Recreate the table:
DROP TABLE twitterbuzz;
DROP SEQUENCE twitterbuzz_serial;
CREATE SEQUENCE twitterbuzz_serial START 1;
CREATE TABLE twitterbuzz (
id text PRIMARY KEY DEFAULT '52|twitter_buzz::'||MD5(nextval('twitterbuzz_serial')::text),
tags text,
language text,
country text,
content text,
topicid text,
sourcetype text,
source text,
actor text,
rtid text,
rtdate text,
date text,
headline text
);
CREATE TABLE twitterbuzz_tags(
tb_id text REFERENCES twitterbuzz(id),
tag text,
PRIMARY KEY(tb_id, tag)
);
4) Insert data:
COPY twitterbuzz(tags,language,country,content,topicid,sourcetype,source,actor,rtid,rtdate,date,headline) FROM '/data/ftp/d4i/social_data/twitter/Buzz JSON Feb/twitter_buzz.csv' CSV;
5) patch data:
UPDATE twitterbuzz SET tags = '' WHERE tags = 'null';
UPDATE twitterbuzz SET tags = replace(tags, ',,', ',') WHERE tags LIKE '%,,%';
insert into twitterbuzz_tags(tb_id, tag) select distinct * from (select id, regexp_split_to_table(tags, ',') as tag from twitterbuzz) as t where tag != '';
alter table twitterbuzz drop column tags;
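-- e.g. (illustrative values): a twitterbuzz row with tags = 'h1n1,vaccine' yields two
-- twitterbuzz_tags rows, (id,'h1n1') and (id,'vaccine'), before the tags column is dropped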
#############################################################################################################################################################
# MANUAL STEPS FOR twitter
1) cd "/data/ftp/d4i/social_data/twitter/Corrected JSON"
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.retweetedActor,(.urls | join("§")),.datetime,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter.csv
3) recreate the table
DROP TABLE twitter;
DROP SEQUENCE twitter_serial;
CREATE SEQUENCE twitter_serial START 1;
CREATE TABLE twitter (
id text PRIMARY KEY DEFAULT '52|twitter_____::'||MD5(nextval('twitter_serial')::text),
tags text,
language text,
country text,
content text,
topicid text,
sourcetype text,
source text,
actor text,
retweetedactor text,
urls text,
datetime timestamp,
headline text
);
CREATE TABLE twitter_tags(
t_id text REFERENCES twitter(id),
tag text,
PRIMARY KEY(t_id, tag)
);
CREATE TABLE twitter_urls(
t_id text REFERENCES twitter(id),
url text,
PRIMARY KEY(t_id, url)
);
4) Insert data:
COPY twitter(tags,language,country,content,topicid,sourcetype,source,actor,retweetedactor,urls,datetime,headline) FROM '/data/ftp/d4i/social_data/twitter/Corrected JSON/twitter.csv' CSV;
5) patch data:
insert into twitter_tags(t_id, tag) select distinct * from (select id, regexp_split_to_table(tags, ',') as tag from twitter) as t where tag != '' and tag != 'null';
insert into twitter_urls(t_id, url) select distinct * from (select id, regexp_split_to_table(urls, '§') as url from twitter) as t where url != '' and url != 'null';
alter table twitter drop column tags;
alter table twitter drop column urls;
#############################################################################################################################################################
# MANUAL STEPS FOR twitter_threads
1) cd "/data/ftp/d4i/social_data/twitter/Threads"
2)
jq -r 'map([.threadId, .length, .velocity, .participants, .startId, .startTime, .endTime]) | .[] | @csv' twitter_threads_metadata.json > twitter_threads_metadata.csv
rm -f twitter_threads_tweets.csv
# jq-1.6 is required
for f in twitter_threads_doi.json twitter_threads_q1.json twitter_threads_q2.json twitter_threads_q3.json twitter_threads_q4.json twitter_threads_q5.json
do
jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' "$f" >> twitter_threads_tweets.csv
done
3)
DROP TABLE IF EXISTS twitter_threads;
CREATE TABLE twitter_threads (
id text PRIMARY KEY,
length int,
velocity double precision,
participants int,
startid int,
starttime timestamp,
endtime timestamp
);
CREATE TABLE twitter_threads_tweets(
threadid text REFERENCES twitter_threads(id),
tweetid int,
fromuser text,
touser text,
inreplyto int,
datetime timestamp,
content text,
quotedstatus text,
mentions text,
urls text,
PRIMARY KEY (threadid, tweetid)
);
CREATE TABLE twitter_threads_tweets_mentions(
threadid text,
tweetid int,
mention text,
PRIMARY KEY (threadid, tweetid, mention),
FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
);
CREATE TABLE twitter_threads_tweets_urls(
threadid text,
tweetid int,
url text,
PRIMARY KEY (threadid, tweetid, url),
FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
);
4) Insert data:
COPY twitter_threads(id, length, velocity, participants, startid, starttime, endtime) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_metadata.csv' CSV;
COPY twitter_threads_tweets(threadid, tweetid, fromuser, touser, inreplyto, datetime, content, quotedstatus, mentions, urls) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_tweets.csv' CSV;
5) patch data:
insert into twitter_threads_tweets_mentions(threadid, tweetid, mention) select distinct * from (select threadid, tweetid, regexp_split_to_table(mentions, '§') as mention from twitter_threads_tweets) as t where mention != '' and mention != 'null';
insert into twitter_threads_tweets_urls (threadid, tweetid, url) select distinct * from (select threadid, tweetid, regexp_split_to_table(urls, '§') as url from twitter_threads_tweets) as t where url != '' and url != 'null';
alter table twitter_threads_tweets drop column mentions;
alter table twitter_threads_tweets drop column urls;

View File

@ -0,0 +1,43 @@
#!/bin/bash
excelFile="../../orig/patents/FP7_patents_full_list_Except_for_ICT.xlsx"
workdir=/tmp/patentsExcel
rm -rf "$workdir" && mkdir "$workdir"
echo
echo "Patents Import:"
#--------------------------------
echo " - Generating csv file"
csv="$workdir/patents.csv"
xlsx2csv -c UTF-8 "$excelFile" > $csv
#--------------------------------
echo " - Recreating the patents_excel database"
dropdb patents_excel --if-exists;
createdb patents_excel;
psql patents_excel -f schema.sql
if [[ -f "$csv" ]]; then
echo " - Importing data: $csv"
psql patents_excel -c "COPY data(pat_id,type_ip,appnum,appnt,title,pat_url,pat_ref,pat_auth,pat_num,pat_kind,note,appln_id,appln_title_patstat,priority_year,var15,projectid) FROM '$csv' CSV HEADER;"
else
echo " - Invalid file: $csv"
fi
psql patents -c "REFRESH MATERIALIZED VIEW document"
psql patents -c "REFRESH MATERIALIZED VIEW doc_other_identifier"
psql patents -c "REFRESH MATERIALIZED VIEW doc_project"
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/patents_excel/*.json
#psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM document ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents_excel/document.json
#psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_other_identifier) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents_excel/doc_other_identifier.json
#psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_project ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents_excel/doc_project.json
echo "Done."
echo

View File

@ -0,0 +1,42 @@
CREATE TABLE data(
pat_id text,
type_ip text,
appnum text,
appnt text,
title text,
pat_url text,
pat_ref text,
pat_auth text,
pat_num text,
pat_kind text,
note text,
appln_id text,
appln_title_patstat text,
priority_year text,
var15 text,
projectid text
);
CREATE MATERIALIZED VIEW document AS SELECT
'50|patents_____::'||MD5(lower(trim(appln_id))) AS "id",
title AS "title",
lower(regexp_replace(type_ip,'s$','')) AS "type",
priority_year AS "pubYear",
'patent repo'::text AS "repository"
FROM data
WHERE appln_id IS NOT NULL AND trim(appln_id) != '';
CREATE MATERIALIZED VIEW doc_other_identifier AS SELECT
'50|patents_____::'||MD5(lower(trim(appln_id))) AS "docId",
trim(appln_id) AS "id",
'patent'::text AS "type"
FROM data
WHERE appln_id IS NOT NULL AND trim(appln_id) != '';
CREATE MATERIALIZED VIEW doc_project AS SELECT
'50|patents_____::'||MD5(lower(trim(appln_id))) AS "docId",
'40|corda_______::'||MD5(lower(trim(projectid))) AS "projectId"
FROM data
WHERE appln_id IS NOT NULL AND trim(appln_id) != '' AND projectid IS NOT NULL AND trim(projectid) != '';

View File

@ -0,0 +1,3 @@
Patent data are available at ftp://prozac.madgik.di.uoa.gr
username: patentdata
passwd: d4ipatents

View File

@ -0,0 +1,48 @@
#!/bin/bash
#jsonPatents=../../orig/patents/patents.json
#jsonFulltexts=../../orig/patents/patents_txt.json
jsonPatents=../../orig/patents/patents_update.json
jsonFulltexts=../../orig/patents/patents_update_txt.json
echo
echo "Patents Import:"
#--------------------------------
echo " - Recreating the patents database"
dropdb patents --if-exists
createdb patents
psql patents -f schema.sql
#--------------------------------
inputJsonPatentsFile="$(cd "$(dirname "$jsonPatents")"; pwd -P)/$(basename "$jsonPatents")"
echo " - Importing json $inputJsonPatentsFile"
psql patents -c "copy patents_json from '$inputJsonPatentsFile' csv quote e'\x01' delimiter e'\x02'"
#--------------------------------
inputJsonFulltextsFile="$(cd "$(dirname "$jsonFulltexts")"; pwd -P)/$(basename "$jsonFulltexts")"
echo " - Importing json $jsonFulltexts"
psql patents -c "copy patents_text_json from '$inputJsonFulltextsFile' csv quote e'\x01' delimiter e'\x02'"
#--------------------------------
echo " - Refreshing views"
psql patents -c "REFRESH MATERIALIZED VIEW document"
psql patents -c "REFRESH MATERIALIZED VIEW doc_fulltext"
psql patents -c "REFRESH MATERIALIZED VIEW doc_other_identifier"
psql patents -c "REFRESH MATERIALIZED VIEW project"
psql patents -c "REFRESH MATERIALIZED VIEW doc_project"
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/patents/*.json
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM document ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/document.json
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_fulltext ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/doc_fulltext.json
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_other_identifier) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/doc_other_identifier.json
# COMMENT THE FOLLOWING LINES IF THE PATENTS ARE NOT RELATED TO FP7
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM project ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/project.json
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_project ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/doc_project.json
echo "Done."
echo

View File

@ -0,0 +1,41 @@
CREATE TABLE patents_json (
json text
);
CREATE TABLE patents_text_json (
json text
);
CREATE MATERIALIZED VIEW document AS SELECT
'50|patents_____::'||MD5(lower(trim(p->>'LocalID'))) AS "id",
p->>'Title' AS "title",
p->>'Abstract' AS "abstractText",
p->>'Type' AS "type",
p->>'PubYear' AS "pubYear",
'patent repo'::text AS "repository"
FROM (SELECT replace(json,'\\"','\"')::json AS p FROM patents_json) a;
CREATE MATERIALIZED VIEW doc_fulltext AS SELECT
'50|patents_____::'||MD5(lower(trim(p->>'LocalID'))) AS "docId",
trim(p->>'text') AS "fulltext"
FROM (SELECT replace(json,'\\"','\"')::json AS p FROM patents_text_json) a
WHERE length(trim(p->>'text')) > 0;
CREATE MATERIALIZED VIEW doc_other_identifier AS SELECT
'50|patents_____::'||MD5(lower(trim(p->>'LocalID'))) AS "docId",
trim(p->>'LocalID') AS "id",
'patent'::text AS "type"
FROM (SELECT replace(json,'\\"','\"')::json AS p FROM patents_json) a;
CREATE MATERIALIZED VIEW project AS SELECT
'50|MOCK_PROJECT::'||MD5('EC_FP7')::text AS "id",
'MOCK PROJECT'::text AS "title",
'EC'::text AS "funder",
'FP7'::text AS "fundingLevel0";
CREATE MATERIALIZED VIEW doc_project AS SELECT
'50|patents_____::'||MD5(lower(trim(p->>'LocalID'))) AS "docId",
'50|MOCK_PROJECT::'||MD5('EC_FP7') AS "projectId"
FROM (SELECT replace(json,'\\"','\"')::json AS p FROM patents_json) a;

View File

@ -0,0 +1 @@
Use the documents in the corresponding directory.

View File

@ -0,0 +1,381 @@
alter table project_portfolio add column json json;
alter table project_portfolio add column administrative_data json;
alter table project_portfolio add column governance_data json;
--sections
alter table project_portfolio add column executive_summary json;
alter table project_portfolio add column final_report_summary json;
alter table project_portfolio add column impact json;
alter table project_portfolio add column objective json;
alter table project_portfolio add column results_in_brief json;
alter table project_portfolio add column results json;
alter table project_portfolio add column title json;
--/sections
update project_portfolio set json = convert_from(decode(portfolio, 'base64'), 'UTF8')::json ;
update project_portfolio set administrative_data = json->'administrative_data';
update project_portfolio set governance_data = json->'governance_data';
update project_portfolio set executive_summary = json->'sections'->'executive_summary';
update project_portfolio set final_report_summary = json->'sections'->'final_report_summary';
update project_portfolio set impact = json->'sections'->'impact';
update project_portfolio set objective = json->'sections'->'objective';
update project_portfolio set results_in_brief = json->'sections'->'results_in_brief';
update project_portfolio set results = json->'sections'->'results';
update project_portfolio set title = json->'sections'->'title';
-- document
INSERT INTO DOCUMENT
(id,
title,
abstract,
doctype,
repository,
rights,
pubyear)
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_object'),
'40|corda_______', '50|fp7___object') AS id,
'Objectives of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
objective ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE objective ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020summary'),
'40|corda_______', '50|fp7__summary') AS id,
'Final report summary of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
final_report_summary ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE final_report_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020___exec'),
'40|corda_______', '50|fp7_____exec') AS id,
'Executive summary of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
executive_summary ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE executive_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020__brief'),
'40|corda_______', '50|fp7____brief') AS id,
'Results in brief of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
results_in_brief ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE results_in_brief ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020results'),
'40|corda_______', '50|fp7__results') AS id,
'Results of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
results ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE results ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_impact'),
'40|corda_______', '50|fp7___impact') AS id,
'Impact of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
impact ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE impact ->> 'text' IS NOT NULL;
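-- (each non-empty portfolio section above becomes a pseudo-document: the doc id reuses the
-- project id with its prefix swapped per section, e.g. '40|corda__h2020' -> '50|h2020_impact')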
-- doc_project
INSERT INTO doc_project
(docid,
projectid,
inferred)
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_object'),
'40|corda_______', '50|fp7___object') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE objective ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020summary'),
'40|corda_______', '50|fp7__summary') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE final_report_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020___exec'),
'40|corda_______', '50|fp7_____exec') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE executive_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020__brief'),
'40|corda_______', '50|fp7____brief') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE results_in_brief ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020results'),
'40|corda_______', '50|fp7__results') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE results ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_impact'),
'40|corda_______', '50|fp7___impact') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE impact ->> 'text' IS NOT NULL;
-- updates the project table with data from the project_portfolios
update project p set (total_cost,contribution,currency) = ((administrative_data->>'Total cost')::numeric, (administrative_data->>'contribution')::numeric, 'EURO'::text) from
project_portfolio pp where pp.projectid = p.id ;
update project_organization po set (contribution, currency) = (U.contribution, 'EURO'::text) from
(
select projectid, '20|ec__________::'||MD5(o->>'pic') as orgid, (o->>'contribution')::numeric as contribution from
(
select projectid, json_array_elements(administrative_data->'coordinators') as o from project_portfolio
union all
select projectid, json_array_elements(administrative_data->'participants') as o from project_portfolio
) as T
) as U where po.orgid = U.orgid and po.projectid = U.projectid;
-- include start/end dates from project portfolios
update project p set startdate = pp.administrative_data->>'date_from' from project_portfolio pp where p.startdate is null and p.id = pp.projectid ;
update project p set enddate = pp.administrative_data->>'date_to' from project_portfolio pp where p.enddate is null and p.id = pp.projectid ;
-- work in progress, waiting for ARC to fix the data. For now we keep only the activity types that do not contain any new-line characters
update project_organization po set activitytype = o.activitytype from (select '20|ec__________::'||MD5(o->>'pic') as orgid, o->>'activity_type' as activitytype from
(
select json_array_elements(administrative_data->'coordinators') as o from project_portfolio
union all
select json_array_elements(administrative_data->'participants') as o from project_portfolio
) as T
where o->>'pic' is not null and o->>'activity_type' !~ E'.*\n.*') o where po.orgid = o.orgid;
-- extract PubMed publications from the project portfolios
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | map([.key, .value.ArticleTitle, .value.AbstractText, .value.ArticleDate]) | .[] | @csv' {} \; > ../document_pp.csv
find . -name '*.json' -exec bash -c "jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as \$id | .value.Authors | to_entries | .[] | .key as \$i | { docid : \$id, fullname : (.value.LastName+\", \"+.value.ForeName), rank: (map(\$i+1) | unique | .[0]) } ) ' \"{}\" | jq -s -r 'map([.docid, .fullname, .rank]) | .[] | @csv' " \; > ../doc_author_pp.csv
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.OtherIDs | map([$id, .Source, .id ] )) | .[] | @csv ' {} \; > ../doc_other_id.csv
find . -name 'FP7*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda_______::\", \$grant ] | @csv " {} \; > ../doc_project_pp.csv
find . -name 'H2020*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda__h2020::\", \$grant ] | @csv " {} \; >> ../doc_project_pp.csv
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.MeshHeadings | map([$id, (group_by(.Label) | .[] )])) | map([.[0], .[1][0].text, ([(.[2][]?.text )] | join("@")) ]) | .[] | @csv ' {} \; > ../doc_subject_pp.csv
-- DOCUMENTS
create table document_pp(id text, title text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN', doctype text default 'publication');
copy document_pp (id, title, abstract, pubyear) from '/Users/claudio/workspace/data/d4i/document_pp.csv' CSV ;
create table document_pp_unique as (select distinct * from document_pp );
drop table document_pp;
alter table document_pp_unique rename to document_pp ;
update document_pp set repository = 'PubMed Central PP' ;
update document_pp set pubyear = to_date(pubyear, 'DD/MM/YYYY')::text ;
update document_pp set id= '50|pp_______267::'||MD5(id) ;
-- DOC_AUTHOR
create table doc_author_pp (docid text, fullname text, rank integer);
copy doc_author_pp (docid, fullname, rank) from '/Users/claudio/workspace/data/d4i/doc_author_pp.csv' CSV ;
update doc_author_pp set fullname = SUBSTRING(fullname, 0, length(fullname) + 1 - 2) where fullname like '%, ';
create table doc_author_pp_u as (select distinct * from doc_author_pp) ;
drop table doc_author_pp;
alter table doc_author_pp_u rename to doc_author_pp ;
update doc_author_pp set docid = '50|pp_______267::'||MD5(docid) ;
-- DOC_SUBJECT
create table doc_subject_pp(docid text, subject text, typology text);
create table subject_tmp(id text, descriptor text, qualifiers text);
copy subject_tmp (id, descriptor, qualifiers) from '/Users/claudio/workspace/data/d4i/doc_subject_pp.csv' CSV;
insert into doc_subject_pp select '50|pp_______267::'||MD5(id) as docid, s as subject, 'MeshHeadings' as typology from ( select id, d||'|'||q as s from ( select id, descriptor as d, unnest(regexp_split_to_array(qualifiers, '@')) as q from subject_tmp where qualifiers <> '') as t UNION ALL select distinct id, descriptor as s from subject_tmp) as t ;
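-- e.g. (illustrative values): a subject_tmp row with descriptor 'Influenza' and qualifiers 'epidemiology@prevention'
-- yields three doc_subject_pp subjects: 'Influenza|epidemiology', 'Influenza|prevention' and 'Influenza'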
create table doc_subject_pp_u as select distinct * from doc_subject_pp;
drop table doc_subject_pp;
alter table doc_subject_pp_u rename to doc_subject_pp;
-- DOC_PROJECT
create table doc_project_pp(docid text, projectid text);
create table dp_tmp (docid text, prefix text, grantid text) ;
copy dp_tmp(docid, prefix, grantid) from '/Users/claudio/workspace/data/d4i/doc_project_pp.csv' CSV;
insert into doc_project_pp select '50|pp_______267::'||MD5(docid), prefix||MD5(grantid) from dp_tmp ;
-- DOC_OTHER_IDENTIFIER
create table doc_other_identifier_pp(docid text, idtype text, id text);
copy doc_other_identifier_pp (docid, idtype, id) from '/Users/claudio/workspace/data/d4i/doc_other_id.csv' CSV;
update doc_other_identifier_pp set idtype = 'pmid' where idtype = 'pubmed' ;
update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ;
update doc_other_identifier_pp set docid= '50|pp_______267::'||MD5(docid);
-- Load the CSVs into the DB, clean the idtype values, generate the (MeSH) subjects, keep distinct values, ...
create table doc_alias_pp(id text, idpp text);
insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> '';
alter table document_pp add column existing_docid text;
alter table doc_other_identifier_pp add column existing_docid text;
alter table doc_author_pp add column existing_docid text;
alter table doc_project_pp add column existing_docid text;
alter table doc_subject_pp add column existing_docid text;
update document_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp;
update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp;
update doc_author_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid = doc_alias_pp.idpp;
update doc_project_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp;
update doc_subject_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp;
update document_pp set id = existing_docid where existing_docid is not null;
update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null;
update doc_author_pp set docid = existing_docid where existing_docid is not null;
update doc_project_pp set docid = existing_docid where existing_docid is not null;
update doc_subject_pp set docid = existing_docid where existing_docid is not null;
alter table document_pp drop column existing_docid ;
alter table doc_other_identifier_pp drop column existing_docid ;
alter table doc_author_pp drop column existing_docid ;
alter table doc_project_pp drop column existing_docid ;
alter table doc_subject_pp drop column existing_docid ;
-- ONLY FOR MISSING DOCUMENTS
insert into document (id, title, abstract, doctype, repository, pubyear, rights) select id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp_______267::%';
insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp_______267::%' on conflict do nothing;
-- FOR ALL DOCUMENTS (pii ids are excluded because the same pii seems to be associated with many documents)
insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid;
insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing;
insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing;
----------------------------
-- extract RestPublications publications from the project portfolios
-- PART 1 - to be run locally
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | map([.key, .value.title, .value.resulttype, .value.description, .value.dateofacceptance]) | .[] | @csv' {} \; > ../document_pp.csv
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.creators | map([$id, .full, .rank])) | .[] | @csv' {} \; > ../doc_author_pp.csv
find . -name 'FP7*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda_______::\", \$grant ] | @csv " {} \; > ../doc_project_pp.csv
find . -name 'H2020*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda__h2020::\", \$grant ] | @csv " {} \; >> ../doc_project_pp.csv
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.subjects | map([$id, .value, .class])) | .[] | @csv' {} \; > ../doc_subject_pp.csv
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.pids | map([$id, .value, .class])) | .[] | @csv' {} \; > ../doc_other_id.csv
-- DOCUMENTS
drop table if exists document_pp;
create table document_pp(id text, title text, doctype text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN');
copy document_pp (id, title, doctype, abstract, pubyear) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/document_pp.csv' CSV ;
create table document_pp_unique as (select distinct * from document_pp );
drop table document_pp;
alter table document_pp_unique rename to document_pp ;
update document_pp set repository = 'Rest Publications PP' ;
update document_pp set id= '50|pp__restpubs::'||MD5(id);
-- DOC_AUTHOR
drop table if exists doc_author_pp;
create table doc_author_pp (docid text, fullname text, rank integer);
copy doc_author_pp (docid, fullname, rank) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_author_pp.csv' CSV ;
create table doc_author_pp_u as (select distinct * from doc_author_pp) ;
drop table doc_author_pp;
alter table doc_author_pp_u rename to doc_author_pp ;
update doc_author_pp set docid = '50|pp__restpubs::'||MD5(docid) ;
-- DOC_PROJECT
drop table if exists doc_project_pp;
create table doc_project_pp(docid text, projectid text);
create table dp_tmp (docid text, prefix text, grantid text) ;
copy dp_tmp(docid, prefix, grantid) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_project_pp.csv' CSV;
insert into doc_project_pp select distinct '50|pp__restpubs::'||MD5(docid), prefix||MD5(grantid) from dp_tmp ;
-- DOC_SUBJECT
drop table if exists doc_subject_pp;
create table doc_subject_pp(docid text, subject text, typology text);
copy doc_subject_pp (docid, subject, typology) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_subject_pp.csv' CSV;
delete from doc_subject_pp where subject is null OR subject = '';
create table doc_subject_pp_u as select distinct * from doc_subject_pp;
drop table doc_subject_pp;
alter table doc_subject_pp_u rename to doc_subject_pp;
update doc_subject_pp set docid = '50|pp__restpubs::'||MD5(docid) ;
-- DOC_OTHER_IDENTIFIER
drop table if exists doc_other_identifier_pp;
create table doc_other_identifier_pp(docid text, id text, idtype text);
copy doc_other_identifier_pp (docid, id, idtype) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_other_id.csv' CSV;
delete from doc_other_identifier_pp where id is null OR id = '';
create table doc_other_identifier_pp_u as select distinct * from doc_other_identifier_pp;
drop table doc_other_identifier_pp;
alter table doc_other_identifier_pp_u rename to doc_other_identifier_pp;
update doc_other_identifier_pp set idtype = 'pmid' where idtype = 'pubmed' ;
update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ;
update doc_other_identifier_pp set docid = '50|pp__restpubs::'||MD5(docid);
-- PART 2 - to be run on the server
create table doc_alias_pp(id text, idpp text);
insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> '';
alter table document_pp add column existing_docid text;
alter table doc_other_identifier_pp add column existing_docid text;
alter table doc_author_pp add column existing_docid text;
alter table doc_project_pp add column existing_docid text;
alter table doc_subject_pp add column existing_docid text;
update document_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp;
update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp;
update doc_author_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid = doc_alias_pp.idpp;
update doc_project_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp;
update doc_subject_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp;
update document_pp set id = existing_docid where existing_docid is not null;
update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null;
update doc_author_pp set docid = existing_docid where existing_docid is not null;
update doc_project_pp set docid = existing_docid where existing_docid is not null;
update doc_subject_pp set docid = existing_docid where existing_docid is not null;
alter table document_pp drop column existing_docid ;
alter table doc_other_identifier_pp drop column existing_docid ;
alter table doc_author_pp drop column existing_docid ;
alter table doc_project_pp drop column existing_docid ;
alter table doc_subject_pp drop column existing_docid ;
-- ONLY FOR MISSING DOCUMENTS
insert into document (id, title, abstract, doctype, repository, pubyear, rights) select distinct id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp__restpubs::%';
insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp__restpubs::%' on conflict do nothing;
-- FOR ALL DOCUMENTS (pii ids are excluded because the same pii seems to be associated with many documents)
insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid;
insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing;
insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing;

View File

@ -0,0 +1,31 @@
#!/bin/bash
SAVEIFS=$IFS
IFS=$(echo -en "\n\b")
tmp="/tmp/tempfile.sql"
rm -f "$tmp"
echo "DELETE FROM project_portfolio;" >> "$tmp"
for f in `ls /data/d4i/project_portfolios/november2018/D4I_Analytics_ARC_Release04_WP52_31Nov2018_fixed/FP7_*.json`
do
id=$(jq .administrative_data.project_id "$f" | tr -d '"')
echo -n "INSERT INTO project_portfolio(projectid, portfolio) VALUES ('40|corda_______::'||MD5('$id'), '" >> "$tmp"
cat "$f" | gzip -c | base64 | tr -d '\n' >> "$tmp"
echo "');" >> "$tmp"
done
for f in `ls /data/d4i/project_portfolios/november2018/D4I_Analytics_ARC_Release04_WP52_31Nov2018_fixed/H2020_*.json`
do
id=$(jq .administrative_data.project_id "$f" | tr -d '"')
echo -n "INSERT INTO project_portfolio(projectid, portfolio) VALUES ('40|corda__h2020::'||MD5('$id'), '" >> "$tmp"
cat "$f" | gzip -c | base64 | tr -d '\n' >> "$tmp"
echo "');" >> "$tmp"
done
echo "Inserting file: $tmp"
#psql data4impact -f "$tmp"
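# Suggested spot-check after loading (not part of the original script):
# psql data4impact -c "SELECT projectid, length(portfolio) FROM project_portfolio LIMIT 1;"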
IFS=$SAVEIFS

View File

@ -0,0 +1,199 @@
CREATE TABLE pp_metrics (
id text PRIMARY KEY,
eu_contribution numeric,
number_of_innovations integer,
number_of_companies_founded integer,
number_of_patents integer,
number_of_projects integer,
number_of_pubmed_publications integer,
number_of_rest_publications integer,
number_of_segments integer,
total_cost numeric
);
CREATE TABLE pp_countries_cooccurrences (
funding text REFERENCES pp_metrics(id),
country1 text,
country2 text,
number integer,
PRIMARY KEY (funding, country1, country2)
);
CREATE TABLE pp_eu_contribution_per_country (
funding text REFERENCES pp_metrics(id),
country text,
contribution numeric,
PRIMARY KEY (funding, country)
);
CREATE TABLE pp_eu_contribution_per_participant_sector (
funding text REFERENCES pp_metrics(id),
sector text,
contribution numeric,
PRIMARY KEY (funding, sector)
);
CREATE TABLE pp_eu_contribution_per_research_area (
funding text REFERENCES pp_metrics(id),
area text,
contribution numeric,
PRIMARY KEY (funding, area)
);
CREATE TABLE pp_eu_contribution_per_research_area_over_time (
funding text REFERENCES pp_metrics(id),
area text,
year integer,
contribution numeric,
PRIMARY KEY (funding, area, year)
);
CREATE TABLE pp_eu_contribution_per_year (
funding text REFERENCES pp_metrics(id),
year integer,
contribution numeric,
PRIMARY KEY (funding, year)
);
CREATE TABLE pp_number_of_innovations_per_type (
funding text REFERENCES pp_metrics(id),
type text,
number integer,
PRIMARY KEY (funding, type)
);
CREATE TABLE pp_number_of_innovations_per_type_per_country (
funding text REFERENCES pp_metrics(id),
type text,
country text,
number integer,
PRIMARY KEY (funding, type, country)
);
CREATE TABLE pp_number_of_innovations_per_type_per_research_area (
funding text REFERENCES pp_metrics(id),
type text,
area text,
number integer,
PRIMARY KEY (funding, type, area)
);
CREATE TABLE pp_number_of_patents_per_research_area (
funding text REFERENCES pp_metrics(id),
area text,
number integer,
PRIMARY KEY (funding, area)
);
CREATE TABLE pp_number_of_projects_per_research_area (
funding text REFERENCES pp_metrics(id),
area text,
number integer,
PRIMARY KEY (funding, area)
);
CREATE TABLE pp_number_of_pubmed_publications_per_country (
funding text REFERENCES pp_metrics(id),
country text,
number integer,
PRIMARY KEY (funding, country)
);
CREATE TABLE pp_number_of_pubmed_publications_per_journal (
funding text REFERENCES pp_metrics(id),
journal text,
number integer,
PRIMARY KEY (funding, journal)
);
CREATE TABLE pp_number_of_pubmed_publications_per_journal_per_research_area (
funding text REFERENCES pp_metrics(id),
journal text,
area text,
number integer,
PRIMARY KEY (funding, journal, area)
);
CREATE TABLE pp_number_of_pubmed_publications_per_journal_per_year (
funding text REFERENCES pp_metrics(id),
journal text,
year integer,
number integer,
PRIMARY KEY (funding, journal, year)
);
CREATE TABLE pp_number_of_pubmed_publications_per_research_area (
funding text REFERENCES pp_metrics(id),
area text,
number integer,
PRIMARY KEY (funding, area)
);
CREATE TABLE pp_number_of_pubmed_publications_per_year (
funding text REFERENCES pp_metrics(id),
year integer,
number integer,
PRIMARY KEY (funding, year)
);
-- IT IS EQUIVALENT TO pp_number_of_pubmed_publications_per_journal_per_year --
CREATE TABLE pp_number_of_pubmed_publications_per_year_per_journal (
funding text REFERENCES pp_metrics(id),
journal text,
year integer,
number integer,
PRIMARY KEY (funding, journal, year)
);
CREATE TABLE pp_number_of_rest_publications_per_research_area (
funding text REFERENCES pp_metrics(id),
area text,
number integer,
PRIMARY KEY (funding, area)
);
CREATE TABLE pp_number_of_rest_publications_per_year (
funding text REFERENCES pp_metrics(id),
year integer,
number integer,
PRIMARY KEY (funding, year)
);
CREATE TABLE pp_research_areas_cooccurrences (
funding text REFERENCES pp_metrics(id),
area1 text,
area2 text,
number integer,
PRIMARY KEY (funding, area1, area2)
);
CREATE TABLE pp_research_areas_to_icd10 (
funding text REFERENCES pp_metrics(id),
area text,
icd10 text,
PRIMARY KEY (funding, area)
);
CREATE TABLE pp_total_cost_per_research_area (
funding text REFERENCES pp_metrics(id),
area text,
cost numeric,
PRIMARY KEY (funding, area)
);
CREATE TABLE pp_total_cost_per_research_area_over_time (
funding text REFERENCES pp_metrics(id),
area text,
year integer,
cost numeric,
PRIMARY KEY (funding, area, year)
);
CREATE TABLE pp_total_cost_per_year (
funding text REFERENCES pp_metrics(id),
year integer,
cost numeric,
PRIMARY KEY (funding, year)
);

View File

@ -0,0 +1,194 @@
#!/bin/bash
file=/Users/claudio/workspace/data/d4i/november2018/D4I_Metrics_ARC_Release04_WP52_31Nov2018/statistics_on_release.json
db=metrics_tmp
echo "Recreating the database $db"
dropdb $db --if-exists
createdb $db
psql $db -f schema.sql
echo
echo "Importing table pp_metrics"
cat $file \
| jq -r 'to_entries | map([.key, .value.eu_contribution, .value.number_of_innovations, .value.number_of_companies_founded, .value.number_of_patents, .value.number_of_projects, .value.number_of_pubmed_publications, .value.number_of_rest_publications, .value.number_of_segments, .value.total_cost]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_metrics(id,eu_contribution,number_of_innovations,number_of_companies_founded,number_of_patents,number_of_projects,number_of_pubmed_publications,number_of_rest_publications,number_of_segments,total_cost) FROM STDIN CSV"
echo
echo "Importing table pp_countries_cooccurrences"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.countries_cooccurrences | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_countries_cooccurrences(funding,country1,country2,number) FROM STDIN CSV"
echo
echo "Importing table pp_eu_contribution_per_country"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.eu_contribution_per_country | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_eu_contribution_per_country(funding,country,contribution) FROM STDIN CSV"
echo
echo "Importing table pp_eu_contribution_per_participant_sector"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.eu_contribution_per_participant_sector | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_eu_contribution_per_participant_sector(funding,sector,contribution) FROM STDIN CSV"
echo
echo "Importing table pp_eu_contribution_per_research_area"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.eu_contribution_per_research_area | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_eu_contribution_per_research_area(funding,area,contribution) FROM STDIN CSV"
echo
echo "Importing table pp_eu_contribution_per_research_area_over_time"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.eu_contribution_per_research_area_over_time | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_eu_contribution_per_research_area_over_time(funding,year,area,contribution) FROM STDIN CSV"
echo
echo "Importing table pp_eu_contribution_per_year"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.eu_contribution_per_year | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_eu_contribution_per_year(funding,year,contribution) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_innovations_per_type"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_innovations_per_type | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_innovations_per_type(funding,type,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_innovations_per_type_per_country"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_innovations_per_type_per_country | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_innovations_per_type_per_country(funding,country,type,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_innovations_per_type_per_research_area"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_innovations_per_type_per_research_area | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_innovations_per_type_per_research_area(funding,area,type,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_patents_per_research_area"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_patents_per_research_area | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_patents_per_research_area(funding,area,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_projects_per_research_area"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_projects_per_research_area | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_projects_per_research_area(funding,area,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_pubmed_publications_per_country"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_pubmed_publications_per_country | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_pubmed_publications_per_country(funding,country,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_pubmed_publications_per_journal"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_pubmed_publications_per_journal | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_pubmed_publications_per_journal(funding,journal,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_pubmed_publications_per_journal_per_research_area"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_pubmed_publications_per_journal_per_research_area | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_pubmed_publications_per_journal_per_research_area(funding,journal,area,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_pubmed_publications_per_journal_per_year"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_pubmed_publications_per_journal_per_year | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_pubmed_publications_per_journal_per_year(funding,journal,year,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_pubmed_publications_per_research_area"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_pubmed_publications_per_research_area | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_pubmed_publications_per_research_area(funding,area,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_pubmed_publications_per_year"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_pubmed_publications_per_year | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_pubmed_publications_per_year(funding,year,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_pubmed_publications_per_year_per_journal"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_pubmed_publications_per_year_per_journal | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_pubmed_publications_per_year_per_journal(funding,year,journal,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_rest_publications_per_research_area"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_rest_publications_per_research_area | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_rest_publications_per_research_area(funding,area,number) FROM STDIN CSV"
echo
echo "Importing table pp_number_of_rest_publications_per_year"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.number_of_rest_publications_per_year | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_number_of_rest_publications_per_year(funding,year,number) FROM STDIN CSV"
echo
echo "Importing table pp_research_areas_cooccurrences"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.research_areas_cooccurrences | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_research_areas_cooccurrences(funding,area1,area2,number) FROM STDIN CSV"
echo
echo "Importing table pp_research_areas_to_icd10"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.research_areas_to_icd10 | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_research_areas_to_icd10(funding,area,icd10) FROM STDIN CSV"
echo
echo "Importing table pp_total_cost_per_research_area"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.total_cost_per_research_area | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_total_cost_per_research_area(funding,area,cost) FROM STDIN CSV"
echo
echo "Importing table pp_total_cost_per_research_area_over_time"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.total_cost_per_research_area_over_time | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_total_cost_per_research_area_over_time(funding,year,area,cost) FROM STDIN CSV"
echo
echo "Importing table pp_total_cost_per_year"
cat $file \
| jq -r 'to_entries | (map([.key, (.value.total_cost_per_year | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' \
| sed -e 's/"null"/-1/' \
| psql $db -c "COPY pp_total_cost_per_year(funding,year,cost) FROM STDIN CSV"
echo

View File

@ -0,0 +1,3 @@
insert into doc_fulltext(docid, fulltext) select d.id as docid, t.fulltext as fulltext from document d left outer join temp_fulltext t on (d.id = t.pubid) where t.fulltext is not null;

View File

@ -0,0 +1,5 @@
DONE using a Java application
cat pubmed.json | while read -r line; do echo "$line" | jq -r '.body["$binary"]' | base64 -d | gunzip -c; done

View File

@ -0,0 +1,7 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
token AS "id",
'doi' AS "type"
FROM projects p , unnest(string_to_array(p.doi_list, ',')) s(token)
WHERE token IS NOT NULL
) t) TO STDOUT;
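-- e.g. (illustrative values): doi_list = '10.1000/a,10.1000/b' yields two rows,
-- one per token, each typed as 'doi'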

View File

@ -0,0 +1,6 @@
COPY (SELECT row_to_json(t) FROM (SELECT distinct
'20|swedish_orgs::'||MD5(lower(organizations_coordinating_en)) AS "id",
organizations_coordinating_en AS "name",
'SE' AS "country"
FROM projects
) t) TO STDOUT;

View File

@ -0,0 +1,8 @@
COPY (SELECT row_to_json(t) FROM (
SELECT
'40|'||rpad(lower(organization_short),12,'_')||'::'||MD5(dnr) AS "projectId",
token AS "docId",
'doi' AS "docIdType"
FROM projects p , unnest(string_to_array(p.doi_list, ',')) s(token)
WHERE token IS NOT NULL
) t) TO STDOUT;
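-- Example of the project-id convention above (illustrative values, not from the data):
--   organization_short = 'VR', dnr = '2015-01234'
--   '40|'||rpad(lower('VR'),12,'_')||'::'||MD5('2015-01234')  ->  '40|vr__________::<md5 hex>'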

View File

@ -0,0 +1,9 @@
COPY (SELECT row_to_json(t) FROM (SELECT
'40|'||rpad(lower(organization_short),12,'_')||'::'||MD5(dnr) AS "projectId",
'20|swedish_orgs::'||MD5(lower(organizations_coordinating_en)) AS "orgId",
'coordinator' AS "role",
people_project_leaders_0_firstname AS "contactFirstNames",
people_project_leaders_0_surname AS "contactLastNames"
FROM projects
) t) TO STDOUT;

View File

@ -0,0 +1,6 @@
COPY (SELECT row_to_json(t) FROM (SELECT
'40|'||rpad(lower(organization_short),12,'_')||'::'||MD5(dnr) AS "projectId",
dnr AS "id",
lower(organization_short)||':grant_id' AS "type"
FROM projects
) t) TO STDOUT;

View File

@ -0,0 +1,19 @@
COPY (SELECT row_to_json(t) FROM (SELECT
'40|'||rpad(lower(organization_short),12,'_')||'::'||MD5(dnr) AS "id",
title_en AS "title",
organization_short AS "funder",
type_of_awards AS "fundingLevel0",
dates_start_date AS "startDate",
dates_end_date AS "endDate",
abstract_en AS "abstractText",
tags_0_en AS "keywords",
total_funding AS "contribution",
'SEK'::text AS "currency"
FROM projects
) t) TO STDOUT;
-- intrascientific_report_en text,
-- popular_report_sv text,
-- doi_list text,
-- total_funding text

View File

@ -0,0 +1,22 @@
CREATE TABLE projects (
swecris_info text,
doi text,
final_reports text,
Organization_short text,
Organization_long text,
dnr text,
people_project_leaders_0_surname text,
people_project_leaders_0_firstname text,
organizations_coordinating_en text,
type_of_awards text,
dates_start_date text,
dates_end_date text,
title_en text,
abstract_en text,
intrascientific_report_en text,
popular_report_sv text,
tags_0_en text,
doi_list text,
total_funding numeric
);

View File

@ -0,0 +1,29 @@
#!/bin/bash
csv=/tmp/180626-swe_proj_data-delivery.csv
inputCsvFile="$(cd "$(dirname "$csv")"; pwd -P)/$(basename "$csv")"
echo
echo "Swedish Projects Import:"
#--------------------------------
echo " - Recreating the swedishprojects database"
dropdb swedishprojects --if-exists;
createdb swedishprojects;
psql swedishprojects -f schema.sql
psql swedishprojects -c "COPY projects(swecris_info, doi, final_reports, Organization_short, Organization_long, dnr, people_project_leaders_0_surname, people_project_leaders_0_firstname, organizations_coordinating_en, type_of_awards, dates_start_date, dates_end_date, title_en, abstract_en, intrascientific_report_en, popular_report_sv, tags_0_en, doi_list, total_funding) FROM '$inputCsvFile' DELIMITER ',' CSV HEADER;"
#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/swedishProjects/*.json
psql swedishprojects -f projects2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/project.json
psql swedishprojects -f orgs2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/organization.json
psql swedishprojects -f projOrg2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/projectOrganization.json
psql swedishprojects -f projOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/projectOtherId.json
psql swedishprojects -f docOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/docotherid.json
psql swedishprojects -f projDoi2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/projectdocotherid.json
echo "Done."
echo

View File

@ -0,0 +1,93 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib</groupId>
<artifactId>data4impact-importer</artifactId>
<version>1.1.0-SNAPSHOT</version>
<!-- <scm> <developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/data4impact/data4impact-importer/trunk</developerConnection>
</scm> <ciManagement> <system>jenkins</system> <url>https://jenkins-dnet.d4science.org/view/data4impact/job/data4impact-importer/</url>
</ciManagement> <distributionManagement> <repository> <id>dnet45-releases</id>
<name>D-Net 45 Releases</name> <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout> </repository> </distributionManagement> -->
<!-- Inherit defaults from Spring Boot -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.0.3.RELEASE</version>
<relativePath></relativePath>
</parent>
<!-- <repositories> <repository> <id>dnet-deps</id> <name>dnet-dependencies</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps</url>
<layout>default</layout> </repository> <repository> <id>dnet45-releases</id>
<name>D-Net 45 Releases</name> <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout> <snapshots> <enabled>true</enabled> </snapshots>
</repository> <repository> <id>dnet45-snapshots</id> <name>D-Net 45 Snapshots</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
<layout>default</layout> <snapshots> <enabled>true</enabled> </snapshots>
</repository> </repositories> -->
<!-- Add typical dependencies for a web application -->
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>data4impact-model</artifactId>
<version>1.1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.9.11</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<!-- JUnit -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<executable>true</executable>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<java.version>1.8</java.version>
<apache.solr.version>7.1.0</apache.solr.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<springfox-version>2.8.0</springfox-version>
<prometheus.version>0.2.0</prometheus.version>
<javamelody.version>1.71.0</javamelody.version>
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
<dockerfile-maven-version>1.3.6</dockerfile-maven-version>
</properties>
</project>
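Because the spring-boot-maven-plugin is configured with <executable>true</executable>, the repackaged jar is directly runnable; a minimal build-and-run sequence would be:

mvn clean package
java -jar target/data4impact-importer-1.1.0-SNAPSHOT.jar project.json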

View File

@ -0,0 +1,88 @@
package eu.data4impact;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.Optional;
import java.util.stream.Stream;

import javax.persistence.EntityManagerFactory;
import javax.transaction.Transactional;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Component;

import com.fasterxml.jackson.databind.ObjectMapper;

@Component
public class Data4ImpactImporter {

	@Autowired
	private ApplicationContext applicationContext;

	@Autowired
	private EntityManagerFactory entityManagerFactory;

	private final ObjectMapper jsonMapper = new ObjectMapper();

	@Transactional
	public <T> void importFileJson(final Path file, final Class<T> tableClass) {
		// The stream returned by Files.lines must be closed, otherwise the file handle leaks
		try (final Stream<String> lines = Files.lines(file, StandardCharsets.UTF_8)) {
			final LocalDateTime start = LocalDateTime.now();
			final JpaRepository<T, ?> repo = findRepositoryForTable(tableClass);
			lines.forEach(l -> processLine(l, tableClass, repo));
			final LocalDateTime end = LocalDateTime.now();
			final double time = Duration.between(start, end).toNanos() / 1_000_000_000.0;
			System.out.printf("\nDone in %.3f sec.\n\n", time);
		} catch (final IOException e) {
			throw new RuntimeException(e);
		}
	}

	@SuppressWarnings("unchecked")
	private <T, K> void processLine(final String line, final Class<T> tableClass, final JpaRepository<T, K> repo) {
		try {
			final T obj = jsonMapper.readValue(line, tableClass);
			final K id = (K) entityManagerFactory.getPersistenceUnitUtil().getIdentifier(obj);
			processObject(obj, id, repo);
		} catch (final IOException | IllegalAccessException | InstantiationException e) {
			throw new RuntimeException(e);
		}
	}

	private <T, K> void processObject(final T obj, final K id, final JpaRepository<T, K> repo) throws IllegalAccessException, InstantiationException {
		System.out.println(id);
		final Optional<T> old = repo.findById(id);
		if (old.isPresent()) {
			// An existing record is merged field by field: non-null values of the new object win
			repo.save(ObjectMerger.mergeObjects(old.get(), obj));
		} else {
			repo.save(obj);
		}
	}

	@Transactional
	public void importFileXML(final String file, final Class<?> tableClass) {
		throw new UnsupportedOperationException("XML import is not implemented");
	}

	@SuppressWarnings("unchecked")
	private <T, K> JpaRepository<T, K> findRepositoryForTable(final Class<T> clazz) {
		// By convention, the repository bean for an entity Foo is named "fooRepository"
		final String repoName = clazz.getSimpleName() + "Repository";
		return applicationContext.getBeansOfType(JpaRepository.class)
				.entrySet()
				.stream()
				.filter(e -> e.getKey().equalsIgnoreCase(repoName))
				.map(e -> e.getValue())
				.findFirst()
				.orElseThrow(() -> new RuntimeException("No repository found for class " + clazz.getName()));
	}
}
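findRepositoryForTable resolves repositories by bean name (entity simple name plus "Repository", compared case-insensitively), so each importable entity needs a matching Spring Data interface. A minimal sketch, assuming a Project entity keyed by a String id and a hypothetical eu.data4impact.repository package:

package eu.data4impact.repository;

import org.springframework.data.jpa.repository.JpaRepository;

import eu.data4impact.model.projects.Project;

// Bean name "projectRepository" matches the case-insensitive lookup for the Project entity
public interface ProjectRepository extends JpaRepository<Project, String> {
}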

View File

@ -0,0 +1,99 @@
package eu.data4impact;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;

import javax.persistence.Table;

import org.reflections.Reflections;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

import eu.data4impact.utils.DatabaseUtils;

@SpringBootApplication
public class Data4ImpactImporterApplication implements CommandLineRunner {

	// private static final Logger log = LoggerFactory.getLogger(Data4ImpactImporterApplication.class);

	@Autowired
	private Data4ImpactImporter importer;

	@Autowired
	private DatabaseUtils databaseUtils;

	public static void main(final String... args) {
		SpringApplication.run(Data4ImpactImporterApplication.class, args);
	}

	@Override
	public void run(final String... args) {
		final Map<String, Class<?>> validEntities = validEntities();

		if (args.length == 0) {
			printHelp();
			printValidFiles(validEntities);
			System.exit(1);
		}

		for (final String f : args) {
			if (f.toLowerCase().endsWith(".json")) {
				System.out.println("Processing file: " + f);
				final Path path = Paths.get(f);
				final String fileName = path.getFileName().toString();
				// The file name (without extension) selects the target entity, e.g. project.json -> Project
				final String entityName = fileName.substring(0, fileName.lastIndexOf('.')).toLowerCase();
				if (validEntities.containsKey(entityName)) {
					importer.importFileJson(path, validEntities.get(entityName));
				} else {
					System.err.println("\n[ERROR] Entity not found for file " + f);
					printValidFiles(validEntities);
					System.exit(1);
				}
			} else {
				System.err.println("\n[ERROR] Not a JSON file: " + f);
				printValidFiles(validEntities);
				System.exit(1);
			}
		}

		System.out.println("Refreshing views...");
		databaseUtils.refreshMaterializedViews(v -> System.out.println(" - " + v));
		System.out.println("Done.\n");
	}

	private void printHelp() {
		System.out.println();
		System.out.println("Missing input files!");
		System.out.println();
		System.out.println("Example: java -jar data4impact-importer.jar file1.json file2.json ...");
		System.out.println();
	}

	private void printValidFiles(final Map<String, Class<?>> validEntities) {
		System.out.println("\nValid filenames are (case insensitive):\n" +
				validEntities.keySet()
						.stream()
						.collect(Collectors.groupingBy(validEntities::get))
						.entrySet()
						.stream()
						.map(e -> String.format(" - For class %s: %s\n",
								e.getKey().getSimpleName(),
								e.getValue().stream().collect(Collectors.joining(".json, ")) + ".json"))
						.collect(Collectors.joining()));
	}

	private Map<String, Class<?>> validEntities() {
		// Accept both the entity class name and its @Table name as valid file names
		final Map<String, Class<?>> res = new HashMap<>();
		for (final Class<?> cl : new Reflections("eu.data4impact.model").getTypesAnnotatedWith(Table.class)) {
			res.put(cl.getSimpleName().toLowerCase(), cl);
			res.put(cl.getAnnotation(Table.class).name().toLowerCase(), cl);
		}
		return res;
	}
}
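Input file names select the target entity via the class name or its @Table name, so a typical invocation (file names assumed from the entities in data4impact-model) would be:

java -jar data4impact-importer.jar project.json organization.json projectorganization.json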

View File

@ -0,0 +1,23 @@
package eu.data4impact;

import java.lang.reflect.Field;
import java.lang.reflect.Modifier;

public class ObjectMerger {

	/**
	 * Returns a new instance whose fields are taken from <code>second</code> when non-null,
	 * and from <code>first</code> otherwise. Only the fields declared by the concrete class
	 * are considered (inherited fields are not merged).
	 */
	@SuppressWarnings("unchecked")
	public static <T> T mergeObjects(final T first, final T second) throws IllegalAccessException, InstantiationException {
		final Class<?> clazz = first.getClass();
		final Field[] fields = clazz.getDeclaredFields();
		final T res = (T) clazz.newInstance();
		for (final Field f : fields) {
			// Skip constants and static fields: they must not be rewritten per instance
			if (!Modifier.isFinal(f.getModifiers()) && !Modifier.isStatic(f.getModifiers())) {
				f.setAccessible(true);
				final Object v1 = f.get(first);
				final Object v2 = f.get(second);
				f.set(res, v2 != null ? v2 : v1);
			}
		}
		return res;
	}
}

View File

@ -0,0 +1,14 @@
spring.main.banner-mode = off
logging.level.root = WARN
spring.datasource.url=jdbc:postgresql://localhost:5432/data4impact
spring.datasource.username=
spring.datasource.password=
spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect
# Hibernate ddl auto (create, create-drop, validate, update)
spring.jpa.hibernate.ddl-auto = validate
spring.jpa.properties.hibernate.hbm2ddl.extra_physical_table_types = MATERIALIZED VIEW
spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation=true
spring.jpa.open-in-view=true
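The datasource credentials are deliberately left empty; as with any Spring Boot application they can be supplied at launch time instead, e.g. (user and password are placeholders):

java -jar data4impact-importer.jar --spring.datasource.username=dnet --spring.datasource.password=secret project.json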

View File

@ -0,0 +1,26 @@
package eu.data4impact;

import static org.junit.Assert.assertTrue;

import org.junit.Test;

import eu.data4impact.model.projects.Project;

public class ObjectMergerTest {

	@Test
	public void test() throws IllegalAccessException, InstantiationException {
		final Project p1 = new Project();
		final Project p2 = new Project();
		p2.setEcSc39(true);

		// Regardless of the argument order, the non-null value must survive the merge
		final Project p3 = ObjectMerger.mergeObjects(p1, p2);
		final Project p4 = ObjectMerger.mergeObjects(p2, p1);

		assertTrue(p3.getEcSc39());
		assertTrue(p4.getEcSc39());
	}
}

View File

@ -0,0 +1,14 @@
spring.main.banner-mode = off
logging.level.root = WARN
spring.datasource.url=jdbc:postgresql://localhost:5432/data4impact
spring.datasource.username=
spring.datasource.password=
spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect
# Hibernate ddl auto (create, create-drop, validate, update)
spring.jpa.hibernate.ddl-auto = validate
spring.jpa.properties.hibernate.hbm2ddl.extra_physical_table_types = MATERIALIZED VIEW
spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation=true
spring.jpa.open-in-view=true

Some files were not shown because too many files have changed in this diff.