re-packing all the D4I applications
This commit is contained in:
parent
eaefc4d6d9
commit
2905051469
|
@ -0,0 +1,119 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Data4Impact API application (Spring Boot web service). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

	<modelVersion>4.0.0</modelVersion>

	<groupId>eu.dnetlib</groupId>
	<artifactId>data4impact-api-application</artifactId>
	<version>1.1.0-SNAPSHOT</version>

	<!-- <scm>
		<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/data4impact/data4impact-api-application/trunk</developerConnection>
	</scm>
	<ciManagement>
		<system>jenkins</system>
		<url>https://jenkins-dnet.d4science.org/view/data4impact/job/data4impact-api-application/</url>
	</ciManagement>
	<distributionManagement>
		<repository>
			<id>dnet45-releases</id>
			<name>D-Net 45 Releases</name>
			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
			<layout>default</layout>
		</repository>
	</distributionManagement>
	-->

	<!-- Inherit defaults from Spring Boot -->
	<parent>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-parent</artifactId>
		<version>2.0.3.RELEASE</version>
		<relativePath></relativePath>
	</parent>

	<!--
	<repositories>
		<repository>
			<id>dnet-deps</id>
			<name>dnet-dependencies</name>
			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps</url>
			<layout>default</layout>
		</repository>
		<repository>
			<id>dnet45-releases</id>
			<name>D-Net 45 Releases</name>
			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
			<layout>default</layout>
			<snapshots>
				<enabled>true</enabled>
			</snapshots>
		</repository>
		<repository>
			<id>dnet45-snapshots</id>
			<name>D-Net 45 Snapshots</name>
			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
			<layout>default</layout>
			<snapshots>
				<enabled>true</enabled>
			</snapshots>
		</repository>
	</repositories>
	-->

	<!-- Add typical dependencies for a web application -->
	<dependencies>
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-web</artifactId>
		</dependency>
		<dependency>
			<groupId>eu.dnetlib</groupId>
			<artifactId>data4impact-model</artifactId>
			<version>1.1.0-SNAPSHOT</version>
		</dependency>

		<!-- Swagger: both artifacts share the springfox-version property below,
		     so the two can never drift apart again (they were 2.9.2 hard-coded
		     while the property said 2.8.0). -->
		<dependency>
			<groupId>io.springfox</groupId>
			<artifactId>springfox-swagger2</artifactId>
			<version>${springfox-version}</version>
		</dependency>

		<dependency>
			<groupId>io.springfox</groupId>
			<artifactId>springfox-swagger-ui</artifactId>
			<version>${springfox-version}</version>
		</dependency>

		<!-- JUnit (version managed by the Spring Boot parent) -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<scope>test</scope>
		</dependency>
	</dependencies>

	<build>
		<plugins>
			<plugin>
				<groupId>org.springframework.boot</groupId>
				<artifactId>spring-boot-maven-plugin</artifactId>
				<configuration>
					<!-- produce a fully executable jar (embedded launch script) -->
					<executable>true</executable>
				</configuration>
			</plugin>
		</plugins>
	</build>

	<properties>
		<java.version>1.8</java.version>
		<apache.solr.version>7.1.0</apache.solr.version>
		<mongodb.driver.version>3.4.2</mongodb.driver.version>
		<!-- FIX: was 2.8.0 while the swagger dependencies hard-coded 2.9.2 -->
		<springfox-version>2.9.2</springfox-version>
		<prometheus.version>0.2.0</prometheus.version>
		<javamelody.version>1.71.0</javamelody.version>
		<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
		<dockerfile-maven-version>1.3.6</dockerfile-maven-version>
	</properties>

</project>
|
|
@ -0,0 +1,47 @@
|
||||||
|
package eu.data4impact;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.springframework.boot.SpringApplication;
|
||||||
|
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||||
|
import org.springframework.cache.annotation.EnableCaching;
|
||||||
|
import org.springframework.context.annotation.Bean;
|
||||||
|
|
||||||
|
import springfox.documentation.builders.ApiInfoBuilder;
|
||||||
|
import springfox.documentation.builders.RequestHandlerSelectors;
|
||||||
|
import springfox.documentation.service.ApiInfo;
|
||||||
|
import springfox.documentation.spi.DocumentationType;
|
||||||
|
import springfox.documentation.spring.web.plugins.Docket;
|
||||||
|
import springfox.documentation.swagger2.annotations.EnableSwagger2;
|
||||||
|
|
||||||
|
@SpringBootApplication
|
||||||
|
@EnableSwagger2
|
||||||
|
@EnableCaching
|
||||||
|
public class MainApplication {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(MainApplication.class);
|
||||||
|
|
||||||
|
public static void main(final String[] args) {
|
||||||
|
SpringApplication.run(MainApplication.class, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
public static Docket newSwaggerDocket() {
|
||||||
|
log.info("Initializing SWAGGER...");
|
||||||
|
|
||||||
|
return new Docket(DocumentationType.SWAGGER_2)
|
||||||
|
.select()
|
||||||
|
.apis(RequestHandlerSelectors.any())
|
||||||
|
.paths(p -> p.startsWith("/api/"))
|
||||||
|
.build().apiInfo((new ApiInfoBuilder())
|
||||||
|
.title("Data4impact Service APIs")
|
||||||
|
.description("APIs documentation")
|
||||||
|
.version("1.1")
|
||||||
|
.contact(ApiInfo.DEFAULT_CONTACT)
|
||||||
|
.license("Apache 2.0")
|
||||||
|
.licenseUrl("http://www.apache.org/licenses/LICENSE-2.0")
|
||||||
|
.build());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
package eu.data4impact;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Controller;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMethod;
|
||||||
|
|
||||||
|
@Controller
|
||||||
|
public class SwaggerController {
|
||||||
|
|
||||||
|
@RequestMapping(value = { "/", "/apidoc", "/api-doc", "/doc", "/swagger" }, method = RequestMethod.GET)
|
||||||
|
public String apiDoc() {
|
||||||
|
return "redirect:swagger-ui.html";
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
package eu.data4impact.controller;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.data.domain.PageRequest;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
import org.springframework.web.bind.annotation.PathVariable;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMethod;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
|
|
||||||
|
import eu.data4impact.utils.MainEntity;
|
||||||
|
|
||||||
|
public abstract class AbstractJpaController<T extends MainEntity> {
|
||||||
|
|
||||||
|
public abstract JpaRepository<T, String> getRepo();
|
||||||
|
|
||||||
|
@RequestMapping(value = "/list/{page}/{size}", method = RequestMethod.GET)
|
||||||
|
public final List<T> find(@PathVariable final int page, @PathVariable final int size) {
|
||||||
|
return getRepo().findAll(PageRequest.of(page, size)).getContent();
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/identifiers/{page}/{size}", method = RequestMethod.GET)
|
||||||
|
public final List<String> findIdentifiers(@PathVariable final int page, @PathVariable final int size) {
|
||||||
|
return find(page, size).stream().map(MainEntity::getId).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/get", method = RequestMethod.GET)
|
||||||
|
public final T get(@RequestParam final String id) {
|
||||||
|
return getRepo().findById(id).orElse(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/count", method = RequestMethod.GET)
|
||||||
|
public final long count() {
|
||||||
|
return getRepo().count();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
package eu.data4impact.controller;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.data.domain.PageRequest;
|
||||||
|
import org.springframework.web.bind.annotation.PathVariable;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMethod;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
|
|
||||||
|
import eu.data4impact.utils.MainEntity;
|
||||||
|
import eu.data4impact.utils.ReadOnlyRepository;
|
||||||
|
|
||||||
|
public abstract class AbstractReadOnlyController<T extends MainEntity> {
|
||||||
|
|
||||||
|
public abstract ReadOnlyRepository<T, String> getRepo();
|
||||||
|
|
||||||
|
@RequestMapping(value = "/list/{page}/{size}", method = RequestMethod.GET)
|
||||||
|
public final List<T> find(@PathVariable final int page, @PathVariable final int size) {
|
||||||
|
return getRepo().findAll(PageRequest.of(page, size)).getContent();
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/identifiers/{page}/{size}", method = RequestMethod.GET)
|
||||||
|
public final List<String> findIdentifiers(@PathVariable final int page, @PathVariable final int size) {
|
||||||
|
return find(page, size).stream().map(MainEntity::getId).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/get", method = RequestMethod.GET)
|
||||||
|
public final T get(@RequestParam final String id) {
|
||||||
|
return getRepo().findById(id).orElse(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/count", method = RequestMethod.GET)
|
||||||
|
public final long count() {
|
||||||
|
return getRepo().count();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
package eu.data4impact.controller;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.cache.annotation.CacheEvict;
|
||||||
|
import org.springframework.cache.annotation.Cacheable;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMethod;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
|
||||||
|
import eu.data4impact.utils.Counter;
|
||||||
|
import eu.data4impact.utils.DatabaseUtils;
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("/admin")
|
||||||
|
public class AdminController {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private DatabaseUtils databaseUtils;
|
||||||
|
|
||||||
|
@RequestMapping(value = "/materializedViews", method = RequestMethod.GET)
|
||||||
|
public List<String> materializedViews(@RequestParam(required = false, defaultValue = "false") final boolean refresh) {
|
||||||
|
return refresh ? databaseUtils.refreshMaterializedViews() : databaseUtils.materializedViews();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Cacheable(value = "simpleCache", key = "'tables'")
|
||||||
|
@RequestMapping(value = "/tables", method = RequestMethod.GET)
|
||||||
|
public List<Counter> tables() {
|
||||||
|
return databaseUtils.tableSizes();
|
||||||
|
}
|
||||||
|
|
||||||
|
@CacheEvict(cacheNames = { "simpleCache" }, allEntries = true)
|
||||||
|
@RequestMapping(value = "/clearCaches", method = RequestMethod.GET)
|
||||||
|
public List<String> clearCaches() {
|
||||||
|
return Arrays.asList("Done.");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,51 @@
|
||||||
|
package eu.data4impact.controller;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.data.domain.PageRequest;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
import org.springframework.web.bind.annotation.PathVariable;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMethod;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
|
||||||
|
import eu.data4impact.model.documents.DocFulltext;
|
||||||
|
import eu.data4impact.model.documents.Document;
|
||||||
|
import eu.data4impact.repository.DocFulltextRepository;
|
||||||
|
import eu.data4impact.repository.DocumentRepository;
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("/api/docs")
|
||||||
|
public class DocumentController extends AbstractJpaController<Document> {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private DocumentRepository documentRepository;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private DocFulltextRepository docFulltextRepository;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public JpaRepository<Document, String> getRepo() {
|
||||||
|
return documentRepository;
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/fulltext", method = RequestMethod.GET, produces = "text/plain")
|
||||||
|
public String fulltext(@RequestParam final String id) {
|
||||||
|
return docFulltextRepository.findById(id).map(DocFulltext::getFulltext).orElse("");
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/byType/{type}/{page}/{size}", method = RequestMethod.GET)
|
||||||
|
public List<Document> findByType(@PathVariable final String type, @PathVariable final int page, @PathVariable final int size) {
|
||||||
|
return documentRepository.findByType(type, PageRequest.of(page, size)).getContent();
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/types", method = RequestMethod.GET)
|
||||||
|
public Map<String, Long> types() {
|
||||||
|
return documentRepository.types().stream().collect(Collectors.toMap(s -> s, s -> documentRepository.countByType(s)));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
package eu.data4impact.controller;
|
||||||
|
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
|
||||||
|
import eu.data4impact.model.journals.Journal;
|
||||||
|
import eu.data4impact.repository.JournalRepository;
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("/api/journals")
|
||||||
|
public class JournalController extends AbstractJpaController<Journal> {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private JournalRepository journalRepository;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public JpaRepository<Journal, String> getRepo() {
|
||||||
|
return journalRepository;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,43 @@
|
||||||
|
package eu.data4impact.controller;
|
||||||
|
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.data.domain.PageRequest;
|
||||||
|
import org.springframework.web.bind.annotation.PathVariable;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMethod;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
|
||||||
|
import eu.data4impact.repository.readonly.OrganizationViewRepository;
|
||||||
|
import eu.data4impact.utils.ReadOnlyRepository;
|
||||||
|
import eu.data4impact.views.OrganizationView;
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("/api/organizations")
|
||||||
|
public class OrganizationController extends AbstractReadOnlyController<OrganizationView> {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private OrganizationViewRepository organizationViewRepository;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ReadOnlyRepository<OrganizationView, String> getRepo() {
|
||||||
|
return organizationViewRepository;
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/companies/{page}/{size}", method = RequestMethod.GET)
|
||||||
|
public List<OrganizationView> findCompanies(@PathVariable final int page, @PathVariable final int size) {
|
||||||
|
return organizationViewRepository.findByCompany(true, PageRequest.of(page, size)).getContent();
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/summary", method = RequestMethod.GET)
|
||||||
|
public Map<String, Long> summary() {
|
||||||
|
final Map<String, Long> res = new LinkedHashMap<>();
|
||||||
|
res.put("all", organizationViewRepository.count());
|
||||||
|
res.put("companies", organizationViewRepository.countByCompany(true));
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,76 @@
|
||||||
|
package eu.data4impact.controller;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

import javax.servlet.http.HttpServletResponse;

import org.apache.commons.io.IOUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import eu.data4impact.model.projects.ProjectPortfolio;
import eu.data4impact.repository.ProjectPortfolioRepository;
import eu.data4impact.repository.readonly.ProjectViewRepository;
import eu.data4impact.utils.ReadOnlyRepository;
import eu.data4impact.views.ProjectView;

/**
 * REST endpoints for projects, backed by a read-only database view, plus
 * access to the gzipped/base64-encoded project portfolio JSON.
 *
 * FIX: base64 decoding now uses the standard library ({@link java.util.Base64},
 * MIME decoder, which tolerates line breaks) instead of the PostgreSQL driver's
 * internal org.postgresql.util.Base64; the portfolio is read and written as
 * UTF-8 instead of the platform default charset, so the emitted JSON no longer
 * depends on the JVM's file.encoding.
 */
@RestController
@RequestMapping("/api/projects")
public class ProjectController extends AbstractReadOnlyController<ProjectView> {

	@Autowired
	private ProjectViewRepository projectRepository;

	@Autowired
	private ProjectPortfolioRepository projectPortfolioRepository;

	@Override
	public ReadOnlyRepository<ProjectView, String> getRepo() {
		return projectRepository;
	}

	/** Returns one page of projects of the given funder. */
	@RequestMapping(value = "/byFunder/{funder}/{page}/{size}", method = RequestMethod.GET)
	public List<ProjectView> findByFunder(@PathVariable final String funder, @PathVariable final int page, @PathVariable final int size) {
		return projectRepository.findByFunder(funder, PageRequest.of(page, size)).getContent();
	}

	/**
	 * Returns a map from funder to the number of its projects.
	 * NOTE(review): issues one count query per funder (N+1); a grouped count
	 * in the repository would be cheaper.
	 */
	@RequestMapping(value = "/funders", method = RequestMethod.GET)
	public Map<String, Long> funders() {
		return projectRepository.funders().stream().collect(Collectors.toMap(s -> s, s -> projectRepository.countByFunder(s)));
	}

	/**
	 * Streams the portfolio JSON of a project to the response. The portfolio
	 * is stored base64-encoded and gzipped; an empty JSON object is written
	 * when the project has no portfolio.
	 *
	 * @param id  the project id
	 * @param res the servlet response the JSON is written to
	 * @throws IOException if writing to the response fails
	 */
	@RequestMapping(value = "/portfolio", method = RequestMethod.GET)
	public final void getPortfolio(@RequestParam final String id, final HttpServletResponse res) throws IOException {
		res.setContentType("application/json");
		res.setCharacterEncoding(StandardCharsets.UTF_8.name());

		final String json = projectPortfolioRepository.findById(id)
				.map(ProjectPortfolio::getPortfolio)
				.map(s -> Base64.getMimeDecoder().decode(s))
				.map(this::gunzip)
				.orElse("{}");

		IOUtils.write(json, res.getOutputStream(), StandardCharsets.UTF_8);
	}

	/**
	 * Decompresses a gzipped byte array into a UTF-8 string.
	 *
	 * @throws RuntimeException wrapping the underlying IOException on failure
	 */
	private String gunzip(final byte[] bytes) {
		try (final ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
				final GZIPInputStream gis = new GZIPInputStream(bis)) {
			return IOUtils.toString(gis, StandardCharsets.UTF_8);
		} catch (final IOException e) {
			throw new RuntimeException("Error decompressing portfolio", e);
		}
	}

}
|
|
@ -0,0 +1,22 @@
|
||||||
|
package eu.data4impact.controller;
|
||||||
|
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.data.jpa.repository.JpaRepository;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
|
||||||
|
import eu.data4impact.model.topics.Topic;
|
||||||
|
import eu.data4impact.repository.TopicRepository;
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("/api/topics")
|
||||||
|
public class TopicController extends AbstractJpaController<Topic> {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private TopicRepository topicRepository;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public JpaRepository<Topic, String> getRepo() {
|
||||||
|
return topicRepository;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
spring.main.banner-mode = off
|
||||||
|
logging.level.root = INFO
|
||||||
|
|
||||||
|
spring.datasource.url=jdbc:postgresql://localhost:5432/data4impact
|
||||||
|
spring.datasource.username=
|
||||||
|
spring.datasource.password=
|
||||||
|
|
||||||
|
spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect
|
||||||
|
|
||||||
|
# Hibernate ddl auto (create, create-drop, validate, update)
|
||||||
|
spring.jpa.hibernate.ddl-auto = validate
|
||||||
|
spring.jpa.properties.hibernate.hbm2ddl.extra_physical_table_types = MATERIALIZED VIEW
|
||||||
|
spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation=true
|
||||||
|
spring.jpa.open-in-view=true
|
|
@ -0,0 +1,14 @@
|
||||||
|
spring.main.banner-mode = off
|
||||||
|
logging.level.root = INFO
|
||||||
|
|
||||||
|
spring.datasource.url=jdbc:postgresql://localhost:5432/data4impact
|
||||||
|
spring.datasource.username=
|
||||||
|
spring.datasource.password=
|
||||||
|
|
||||||
|
spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect
|
||||||
|
|
||||||
|
# Hibernate ddl auto (create, create-drop, validate, update)
|
||||||
|
spring.jpa.hibernate.ddl-auto = validate
|
||||||
|
spring.jpa.properties.hibernate.hbm2ddl.extra_physical_table_types = MATERIALIZED VIEW
|
||||||
|
spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation=true
|
||||||
|
spring.jpa.open-in-view=true
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,5 @@
|
||||||
|
eu/data4impact/controller/AbstractJpaController.class
|
||||||
|
eu/data4impact/controller/JournalController.class
|
||||||
|
eu/data4impact/controller/OrganizationController.class
|
||||||
|
eu/data4impact/controller/AbstractReadOnlyController.class
|
||||||
|
eu/data4impact/controller/AdminController.class
|
|
@ -0,0 +1,10 @@
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/OrganizationController.java
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/AbstractReadOnlyController.java
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/AdminController.java
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/JournalController.java
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/MainApplication.java
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/TopicController.java
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/ProjectController.java
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/AbstractJpaController.java
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/SwaggerController.java
|
||||||
|
/Users/claudio/workspace/git/data4impact/apps/data4impact-api-application/src/main/java/eu/data4impact/controller/DocumentController.java
|
|
@ -0,0 +1,3 @@
|
||||||
|
-- Populate Document.batchid with the first 4 characters of pubyear
-- (in PostgreSQL, SUBSTR(x, 0, 5) yields characters 1..4 because a
-- start position of 0 shortens the extracted length by one).
UPDATE public.Document
SET batchid = SUBSTR(pubyear,0,5) ;
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
-- Population of the doc_project relation using the data from project -> project_doc_other_id -> doc_other_identifier -> document
-- Rows created here are marked inferred=true to distinguish them from directly asserted links.

insert into
	doc_project(projectid, docid, inferred)
select
	p.projectid,
	d.docid,
	true as inferred
from
	project_doc_other_id p
	left outer join doc_other_identifier d on (p.docid = d.id and p.docidtype = d.idtype)
where
	d.docid is not null
on conflict do nothing;


-- Population of the doc_doc relation using the data from document(eg: guidelines) -> doc_doc_other_id -> doc_other_identifier -> document(eg: publication)

insert into
	doc_doc(docid1, docid2, reltype, inferred)
select
	d.docid1 as docid1,
	i.docid as docid2,
	d.reltype as reltype,
	true as inferred
from
	doc_doc_other_id d
	left outer join doc_other_identifier i on (d.docid2 = i.id and d.docid2type = i.idtype)
where
	i.docid is not null
on conflict do nothing;

-- Remove redundant doc_project relations (references to MOCK PROJECTS would be counted twice, otherwise)
-- Strategy: for each (docid, funder) group that contains both a MOCK_PROJECT and at least
-- one real project, collect the MOCK_PROJECT links into a temp table and delete them.

create table temp_delete_doc_project as select t.docid||'@'||t.projectid as item from (
	select
		dp.docid,
		unnest(array_agg(dp.projectid)) as projectid
	from
		doc_project dp
		left outer join project p on (dp.projectid = p.id)
	group by
		dp.docid, p.funder
	having
		array_to_string(array_agg(dp.projectid), ',', '') like '%MOCK_PROJECT%'
		and array_length(array_agg(DISTINCT dp.projectid), 1) > 1
) as t
where t.projectid like '40|MOCK_PROJECT::%';

-- index the composite key so the delete below is not a sequential scan per row
create index temp_delete_doc_project_item_idx on temp_delete_doc_project(item);

delete from doc_project where docid||'@'||projectid in (
	select item
	from temp_delete_doc_project
);

-- the temp table is only needed for this cleanup pass
drop table temp_delete_doc_project;
|
|
@ -0,0 +1,29 @@
|
||||||
|
#!/bin/bash

# Runs the data4impact importer over every available JSON dataset, in
# dependency order (projects/organizations first, then document links).
# NOTE(review): the jar path is hard-coded to a developer's local
# ~/.m2 repository — consider parameterizing it.
java -jar /Users/michele/.m2/repository/eu/dnetlib/data4impact-importer/1.1.0-SNAPSHOT/data4impact-importer-1.1.0-SNAPSHOT.jar \
	./jsonfiles/swedishProjects/project.json \
	./jsonfiles/swedishProjects/projectOtherId.json \
	./jsonfiles/swedishProjects/organization.json \
	./jsonfiles/swedishProjects/projectOrganization.json \
	./jsonfiles/swedishProjects/docotherid.json \
	./jsonfiles/swedishProjects/projectdocotherid.json \
	./jsonfiles/ecProjectsOpenaire/project.json \
	./jsonfiles/ecProjectsOpenaire/projectOtherId.json \
	./jsonfiles/ecProjectsOpenaire/organization.json \
	./jsonfiles/ecProjectsOpenaire/organizationOtherId.json \
	./jsonfiles/ecProjectsOpenaire/projectOrganization.json \
	./jsonfiles/cordis/project.json \
	./jsonfiles/cordis/projectOtherId.json \
	./jsonfiles/cordis/organization.json \
	./jsonfiles/cordis/organizationOtherId.json \
	./jsonfiles/cordis/projectOrganization.json \
	./jsonfiles/companydata/orgCompanyMetrics.json \
	./jsonfiles/funderdata/project.json \
	./jsonfiles/funderdata/docotherid.json \
	./jsonfiles/funderdata/projectdocotherid.json \
	./jsonfiles/patents/document.json \
	./jsonfiles/patents/doc_fulltext.json \
	./jsonfiles/patents/doc_other_identifier.json \
	./jsonfiles/guidelines/document.json \
	./jsonfiles/guidelines/docotherid.json
|
@ -0,0 +1,18 @@
|
||||||
|
# MANUAL STEPS FOR clinical trials
|
||||||
|
|
||||||
|
1) cd /data/ftp/d4i/clinical_trials
|
||||||
|
|
||||||
|
2) Recreate the table in the DB using
|
||||||
|
|
||||||
|
DROP TABLE clinical_trials;
|
||||||
|
|
||||||
|
CREATE TABLE clinical_trials (
|
||||||
|
doi text,
|
||||||
|
trial_number text,
|
||||||
|
trial_registry text
|
||||||
|
);
|
||||||
|
|
||||||
|
3) Insert data:
|
||||||
|
|
||||||
|
COPY clinical_trials(doi,trial_number,trial_registry) FROM '/data/ftp/d4i/clinical_trials/clintrial.txt' DELIMITER E'\t';
|
||||||
|
DELETE FROM clinical_trials where doi = 'pub-with-clin-trial';
|
|
@ -0,0 +1,37 @@
|
||||||
|
#!/bin/bash

# Imports the D4I company summary data:
#   1. converts the JSON summary into CSV with jq
#   2. recreates the "companydata" postgres database from schema.sql
#   3. bulk-loads the CSV into the companymetrics table
#   4. exports the table as JSON for the generic importer
#
# FIX: all variable expansions are now quoted (paths with spaces no longer
# break the script) and the useless `cat | jq` was replaced by passing the
# file directly to jq.

detailsFile=../../orig/CompanyData/D4I_companies_summary.txt

workdir=/tmp/companydata
rm -rf "$workdir" && mkdir "$workdir"

echo
echo "CompanyData Import:"

#--------------------------------
echo " - Generating csv files"
csvDetails="$workdir/details.csv"
# flatten the {orgid: {metrics...}} object into one CSV row per organization
jq 'to_entries' "$detailsFile" | jq 'map([.key, .value."data gathered?", .value."tangible + pre_market", .value."tangible + market", .value."intangible + pre_market", .value."intangible + market", .value."innovation?"])' | jq .[] | jq -r @csv > "$csvDetails"

#--------------------------------
echo " - Recreating the companydata database"
dropdb companydata --if-exists;
createdb companydata;
psql companydata -f schema.sql

if [[ -f "$csvDetails" ]]; then
	echo " - Importing details: $csvDetails"
	psql companydata -c "COPY companymetrics(orgid, data_gathered, tangible_pre_market, tangible_market, intangible_pre_market, intangible_market, innovation) FROM '$csvDetails' CSV;"
else
	echo " - Invalid file: $csvDetails"
fi

#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/companydata/*.json
# the sed collapses double backslashes produced by psql's text output
psql companydata -f metrics2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/companydata/orgCompanyMetrics.json

echo "Done."
echo
|
@ -0,0 +1,40 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
detailsFile=../../orig/CompanyData/D4I_company_innovation_texts.json
|
||||||
|
|
||||||
|
workdir=/tmp/companydata_texts
|
||||||
|
rm -rf "$workdir" && mkdir "$workdir"
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "CompanyData Innovation texts Import:"
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Generating csv files"
|
||||||
|
csvDetails="$workdir/details.csv"
|
||||||
|
cat $detailsFile | jq --slurp -r '(map(keys) | add | unique) as $cols | map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv' > $csvDetails
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Recreating the companydata_texts database"
|
||||||
|
dropdb companydata_texts --if-exists;
|
||||||
|
createdb companydata_texts;
|
||||||
|
psql companydata_texts -f schema_texts.sql
|
||||||
|
|
||||||
|
if [[ -f "$csvDetails" ]]; then
|
||||||
|
echo " - Importing details: $csvDetails"
|
||||||
|
psql companydata_texts -c "COPY data(company_id,prediction_revised,site_url,source,text_clean_gentle,text_clean_strong,text_is_duplicated) FROM '$csvDetails' CSV HEADER;"
|
||||||
|
else
|
||||||
|
echo " - Invalid file: $csvDetails"
|
||||||
|
fi
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Generating json files"
|
||||||
|
rm -f ../../jsonfiles/companydata_texts/*.json
|
||||||
|
psql companydata_texts -f innovationTexts2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/companydata_texts/orgCompanyInnovationTexts.json
|
||||||
|
|
||||||
|
echo "Done."
|
||||||
|
echo
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
'20|ec__________::'||MD5(company_id) AS "orgId",
|
||||||
|
prediction_revised AS "predictionRevised",
|
||||||
|
site_url AS "siteUrl",
|
||||||
|
source AS "source",
|
||||||
|
text_clean_gentle AS "textCleanGentle",
|
||||||
|
text_clean_strong AS "textCleanStrong",
|
||||||
|
text_is_duplicated AS "duplicated"
|
||||||
|
FROM data
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,15 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
'20|ec__________::'||MD5(orgid) AS "orgId",
|
||||||
|
(LOWER(data_gathered)='yes') AS "dataGathered",
|
||||||
|
tangible_pre_market AS "tangiblePreMarket",
|
||||||
|
tangible_market AS "tangibleMarket",
|
||||||
|
intangible_pre_market AS "intangiblePreMarket",
|
||||||
|
intangible_market AS "intangibleMarket",
|
||||||
|
CASE
|
||||||
|
WHEN innovation='0' THEN false
|
||||||
|
WHEN innovation='1' THEN true
|
||||||
|
ELSE NULL
|
||||||
|
END AS "innovation"
|
||||||
|
FROM companymetrics
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,11 @@
|
||||||
|
CREATE TABLE companymetrics (
|
||||||
|
orgid text,
|
||||||
|
data_gathered varchar(5),
|
||||||
|
tangible_pre_market int,
|
||||||
|
tangible_market int,
|
||||||
|
intangible_pre_market int,
|
||||||
|
intangible_market int,
|
||||||
|
innovation varchar(5)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
CREATE TABLE data (
|
||||||
|
company_id text,
|
||||||
|
prediction_revised float,
|
||||||
|
site_url text,
|
||||||
|
source text,
|
||||||
|
text_clean_gentle text,
|
||||||
|
text_clean_strong text,
|
||||||
|
text_is_duplicated boolean
|
||||||
|
);
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
Paolo, Vilius, all
|
||||||
|
|
||||||
|
Last week I was in London attending a "special" event for publishers and I
|
||||||
|
had the opportunity to meet a guy from the Strategic Initiatives dep. of
|
||||||
|
Crossref who pointed me to the Events API
|
||||||
|
(https://www.crossref.org/services/event-data/). Such API links
|
||||||
|
publications to several external sources including Patents, Twitter,
|
||||||
|
Wikipedia, Reddit, StackExchange, Wordpress etc.
|
||||||
|
Running some queries on their db we saw that for Twitter they do have data
|
||||||
|
for more than a year.
|
||||||
|
For patents they are based on Cambia's Lens (https://www.lens.org/) -and
|
||||||
|
they do have links from patents to pubs-. Unfortunately for some reason,
|
||||||
|
Cambia uploaded data only once and then stopped. They will talk to them to
|
||||||
|
see what has happened.
|
||||||
|
In every case, I think that such API is very useful both for D4I and OA,
|
||||||
|
and we should have a look and possibly integrate such data as soon as

possible.
|
||||||
|
|
||||||
|
All the best,
|
||||||
|
|
||||||
|
Omiros
|
|
@ -0,0 +1,32 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
pmcid AS "id",
|
||||||
|
'pmcid' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE pmcid IS NOT NULL AND pmcid != '' AND pmcid not ilike 'none'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
pmid AS "id",
|
||||||
|
'pmid' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE pmid IS NOT NULL AND pmid != '' AND pmid not ilike 'none'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
doi AS "id",
|
||||||
|
'doi' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE doi IS NOT NULL AND doi != '' AND doi not ilike 'none'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
d_b_id AS "id",
|
||||||
|
'drug_bank_id' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE d_b_id IS NOT NULL AND d_b_id != '' AND d_b_id not ilike 'none'
|
||||||
|
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,36 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
|
||||||
|
pmcid AS "docId",
|
||||||
|
'pmcid' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE pmcid IS NOT NULL AND pmcid != '' AND pmcid not ilike 'none'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
|
||||||
|
pmid AS "docId",
|
||||||
|
'pmid' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE pmid IS NOT NULL AND pmid != '' AND pmid not ilike 'none'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
|
||||||
|
doi AS "docId",
|
||||||
|
'doi' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE doi IS NOT NULL AND doi != '' AND doi not ilike 'none'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
|
||||||
|
d_b_id AS "docId",
|
||||||
|
'drug_bank_id' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE d_b_id IS NOT NULL AND d_b_id != '' AND d_b_id not ilike 'none'
|
||||||
|
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,6 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (SELECT distinct
|
||||||
|
'40|MOCK_PROJECT::'||MD5(funder) AS "id",
|
||||||
|
'MOCK PROJECT' AS "title",
|
||||||
|
funder AS "funder"
|
||||||
|
FROM data
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,10 @@
|
||||||
|
CREATE TABLE data (
|
||||||
|
id text,
|
||||||
|
d_b_id text,
|
||||||
|
doi text,
|
||||||
|
pmcid text,
|
||||||
|
pmid text,
|
||||||
|
drug_substance text,
|
||||||
|
funder text,
|
||||||
|
section_of_drug_bank_entry_where_citation_occured text
|
||||||
|
)
|
|
@ -0,0 +1,52 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
excelFile="../../orig/drug_bank_database/Publication_citations_in_Drug_Bank_database.xlsx"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
workdir=/tmp/drugbank
|
||||||
|
rm -rf "$workdir" && mkdir "$workdir"
|
||||||
|
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Links from drugbank db Import:"
|
||||||
|
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Generating csv file"
|
||||||
|
csv="$workdir/drugbank.csv"
|
||||||
|
xlsx2csv -c UTF-8 "$excelFile" > $csv
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Recreating the drugbank database"
|
||||||
|
dbname=drugbank
|
||||||
|
|
||||||
|
dropdb $dbname --if-exists;
|
||||||
|
createdb $dbname;
|
||||||
|
psql $dbname -f schema.sql
|
||||||
|
|
||||||
|
if [[ -f "$csv" ]]; then
|
||||||
|
echo " - Importing data: $csv"
|
||||||
|
psql $dbname -c "COPY data(id,d_b_id,doi,pmcid,pmid,drug_substance,funder,section_of_drug_bank_entry_where_citation_occured) FROM '$csv' CSV HEADER;"
|
||||||
|
else
|
||||||
|
echo " - Invalid file: $csv"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
echo " - Fix funder names"
|
||||||
|
|
||||||
|
psql $dbname -c "UPDATE data SET funder='EC' WHERE funder = 'European Research Council'"
|
||||||
|
psql $dbname -c "UPDATE data SET funder='Austrian Science Fund FWF' WHERE funder = 'FWF'"
|
||||||
|
psql $dbname -c "UPDATE data SET funder='Swiss National Science Foundation SNSF' WHERE funder = 'Swiss National Science Foundation'"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Generating json files"
|
||||||
|
rm -f ../../jsonfiles/drug_bank_database/*.json
|
||||||
|
psql $dbname -f projects2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database/project.json
|
||||||
|
psql $dbname -f docOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database/docotherid.json
|
||||||
|
psql $dbname -f projDocOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database/projectdocotherid.json
|
||||||
|
|
||||||
|
echo "Done."
|
||||||
|
echo
|
|
@ -0,0 +1,32 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
pmcid AS "id",
|
||||||
|
'pmcid' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE pmcid IS NOT NULL AND pmcid != '' AND pmcid not ilike 'none'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
pmid AS "id",
|
||||||
|
'pmid' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE pmid IS NOT NULL AND pmid != '' AND pmid not ilike 'none'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
doi AS "id",
|
||||||
|
'doi' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE doi IS NOT NULL AND doi != '' AND doi not ilike 'none'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
d_b_id AS "id",
|
||||||
|
'drug_bank_id' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE d_b_id IS NOT NULL AND d_b_id != '' AND d_b_id not ilike 'none'
|
||||||
|
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,36 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
'40|corda_______::'||MD5(ec_project_code) AS "projectId",
|
||||||
|
pmcid AS "docId",
|
||||||
|
'pmcid' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE pmcid IS NOT NULL AND pmcid != '' AND pmcid not ilike 'none' AND ec_project_code not ilike 'unknown' AND funding_scheme ilike 'FP7%'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'40|corda_______::'||MD5(ec_project_code) AS "projectId",
|
||||||
|
pmid AS "docId",
|
||||||
|
'pmid' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE pmid IS NOT NULL AND pmid != '' AND pmid not ilike 'none' AND ec_project_code not ilike 'unknown' AND funding_scheme ilike 'FP7%'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'40|corda_______::'||MD5(ec_project_code) AS "projectId",
|
||||||
|
doi AS "docId",
|
||||||
|
'doi' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE doi IS NOT NULL AND doi != '' AND doi not ilike 'none' AND ec_project_code not ilike 'unknown' AND funding_scheme ilike 'FP7%'
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'40|corda_______::'||MD5(ec_project_code) AS "projectId",
|
||||||
|
d_b_id AS "docId",
|
||||||
|
'drug_bank_id' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE d_b_id IS NOT NULL AND d_b_id != '' AND d_b_id not ilike 'none' AND ec_project_code not ilike 'unknown' AND funding_scheme ilike 'FP7%'
|
||||||
|
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,11 @@
|
||||||
|
CREATE TABLE data (
|
||||||
|
doi text,
|
||||||
|
d_b_id text,
|
||||||
|
pmcid text,
|
||||||
|
pmid text,
|
||||||
|
drug_substance text,
|
||||||
|
ec_project_acronym text,
|
||||||
|
ec_project_code text,
|
||||||
|
funding_scheme text,
|
||||||
|
match_type text
|
||||||
|
);
|
|
@ -0,0 +1,39 @@
|
||||||
|
#!/bin/bash
# Imports the DrugBank publication/project links (part 2) into a local
# postgres database (drugbank_p2) and exports them as json for the API.

excelFile="../../orig/drug_bank_database/DB_Publication_project_links.xlsx"

workdir=/tmp/drugbank_part2
rm -rf "$workdir" && mkdir "$workdir"

echo
echo "Links from drugbank db Import:"

#--------------------------------
echo " - Generating csv file"
csv="$workdir/drugbank.csv"
xlsx2csv -c UTF-8 "$excelFile" > "$csv"

#--------------------------------
echo " - Recreating the drugbank database"
dbname=drugbank_p2

dropdb $dbname --if-exists;
createdb $dbname;
psql $dbname -f schema.sql

if [[ -f "$csv" ]]; then
	echo " - Importing data: $csv"
	psql $dbname -c "COPY data(doi,d_b_id,pmcid,pmid,drug_substance,ec_project_acronym,ec_project_code,funding_scheme,match_type) FROM '$csv' CSV HEADER;"
else
	echo " - Invalid file: $csv"
fi

#--------------------------------
echo " - Generating json files"
# BUG FIX: this script writes into drug_bank_database_part2, so that is the
# directory to clean. The old path (drug_bank_database) deleted the json
# files generated by the part-1 script, including its project.json.
rm -f ../../jsonfiles/drug_bank_database_part2/*.json
# The sed collapses the double backslashes that psql's COPY TO STDOUT escaping adds.
psql $dbname -f docOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database_part2/docotherid.json
psql $dbname -f projDocOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/drug_bank_database_part2/projectdocotherid.json

echo "Done."
echo
|
|
@ -0,0 +1,56 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
urlOrgFp7=http://cordis.europa.eu/data/cordis-fp7organizations.xlsx
|
||||||
|
urlOrgH2020=http://cordis.europa.eu/data/cordis-h2020organizations.xlsx
|
||||||
|
|
||||||
|
workdir=/tmp/cordis
|
||||||
|
rm -rf "$workdir" && mkdir "$workdir"
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "cordis Import:"
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Downloading files"
|
||||||
|
wget "$urlOrgFp7" -O "$workdir/fp7orgs.xlsx" -q --show-progress
|
||||||
|
wget "$urlOrgH2020" -O "$workdir/h2020orgs.xlsx" -q --show-progress
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Generating csv files"
|
||||||
|
csvfp7="$workdir/fp7orgs.csv"
|
||||||
|
csvh2020="$workdir/h2020orgs.csv"
|
||||||
|
xlsx2csv -c UTF-8 "$workdir/fp7orgs.xlsx" > $csvfp7
|
||||||
|
xlsx2csv -c UTF-8 "$workdir/h2020orgs.xlsx" > $csvh2020
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Recreating the cordis database"
|
||||||
|
dropdb cordis --if-exists;
|
||||||
|
createdb cordis;
|
||||||
|
psql cordis -f schema.sql
|
||||||
|
|
||||||
|
if [[ -f "$csvfp7" ]]; then
|
||||||
|
echo " - Importing FP7 participants: $csvfp7"
|
||||||
|
psql cordis -c "COPY participants(projectrcn,projectiD,projectacronym,role,orgid,orgname,orgshortname,activitytype,endofparticipation,eccontribution,country,street,city,postCode,organizationurl,vatnumber,contactform,contacttype,contacttitle,contactfirstnames,contactlastnames,contactfunction,contacttelephonenumber,contactfaxnumber) FROM '$csvfp7' CSV HEADER;"
|
||||||
|
psql cordis -c "UPDATE participants SET fundingprogram='FP7' WHERE fundingprogram IS NULL"
|
||||||
|
else
|
||||||
|
echo " - Invalid file fp7: $csvfp7"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -f "$csvh2020" ]]; then
|
||||||
|
echo " - Importing H2020 participants: $csvh2020"
|
||||||
|
psql cordis -c "COPY participants(projectrcn,projectiD,projectacronym,role,orgid,orgname,orgshortname,activitytype,endofparticipation,eccontribution,country,street,city,postCode,organizationurl,vatnumber,contactform,contacttype,contacttitle,contactfirstnames,contactlastnames,contactfunction,contacttelephonenumber,contactfaxnumber) FROM '$csvh2020' CSV HEADER;"
|
||||||
|
psql cordis -c "UPDATE participants SET fundingprogram='H2020' WHERE fundingprogram IS NULL"
|
||||||
|
else
|
||||||
|
echo " - Invalid file h2020: $csvh2020"
|
||||||
|
fi
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Generating json files"
|
||||||
|
rm -f ../../jsonfiles/cordis/*.json
|
||||||
|
psql cordis -f projects2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/project.json
|
||||||
|
psql cordis -f orgs2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/organization.json
|
||||||
|
psql cordis -f projOrg2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/projectOrganization.json
|
||||||
|
psql cordis -f projOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/projectOtherId.json
|
||||||
|
psql cordis -f orgOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/cordis/organizationOtherId.json
|
||||||
|
|
||||||
|
echo "Done."
|
||||||
|
echo
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
'20|ec__________::'||MD5(orgid) AS "orgId",
|
||||||
|
orgid AS "id",
|
||||||
|
'ec:PIC' AS "type"
|
||||||
|
FROM participants
|
||||||
|
WHERE orgid IS NOT NULL
|
||||||
|
GROUP BY orgid
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,13 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (SELECT
|
||||||
|
'20|ec__________::'||MD5(orgid) AS "id",
|
||||||
|
MAX(orgname) AS "name",
|
||||||
|
MAX(orgshortname) AS "shortName",
|
||||||
|
MAX(country) AS "country",
|
||||||
|
MAX(street) AS "street",
|
||||||
|
MAX(city) AS "city",
|
||||||
|
MAX(postcode) AS "postCode",
|
||||||
|
MAX(organizationurl) AS "url"
|
||||||
|
FROM participants
|
||||||
|
WHERE orgid IS NOT NULL
|
||||||
|
GROUP BY orgid
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,26 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
CASE
|
||||||
|
WHEN fundingprogram='FP7' THEN '40|corda_______::'||MD5(projectid)
|
||||||
|
WHEN fundingprogram='H2020' THEN '40|corda__h2020::'||MD5(projectid)
|
||||||
|
ELSE '40|unknown_____::'||MD5(projectid)
|
||||||
|
END AS "projectId",
|
||||||
|
'20|ec__________::'||MD5(orgid) AS "orgId",
|
||||||
|
MAX(role) AS "role",
|
||||||
|
MAX(activitytype) AS "activityType",
|
||||||
|
MAX(endofparticipation) AS "endOfParticipation",
|
||||||
|
MAX(eccontribution) AS "ecContribution",
|
||||||
|
MAX(contacttype) AS "contactType",
|
||||||
|
MAX(contacttitle) AS "contactTitle",
|
||||||
|
MAX(contactfirstnames) AS "contactFirstNames",
|
||||||
|
MAX(contactlastnames) AS "contactLastNames",
|
||||||
|
MAX(contactfunction) AS "contactFunction",
|
||||||
|
MAX(contacttelephonenumber) AS "contactTelephoneNumber",
|
||||||
|
MAX(contactfaxnumber) AS "contactFaxNumber",
|
||||||
|
MAX(contactform) AS "contactForm"
|
||||||
|
FROM participants
|
||||||
|
WHERE orgid IS NOT NULL AND projectid IS NOT NULL
|
||||||
|
GROUP BY orgid, projectid, fundingprogram
|
||||||
|
) t) TO STDOUT;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
CASE
|
||||||
|
WHEN fundingprogram='FP7' THEN '40|corda_______::'||MD5(projectid)
|
||||||
|
WHEN fundingprogram='H2020' THEN '40|corda__h2020::'||MD5(projectid)
|
||||||
|
ELSE '40|unknown_____::'||MD5(projectid)
|
||||||
|
END AS "projectId",
|
||||||
|
projectid AS "id",
|
||||||
|
'ec:grant_id' AS "type"
|
||||||
|
FROM participants
|
||||||
|
WHERE projectid IS NOT NULL
|
||||||
|
GROUP BY projectid, fundingprogram
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
CASE
|
||||||
|
WHEN fundingprogram='FP7' THEN '40|corda_______::'||MD5(projectid)
|
||||||
|
WHEN fundingprogram='H2020' THEN '40|corda__h2020::'||MD5(projectid)
|
||||||
|
ELSE '40|unknown_____::'||MD5(projectid)
|
||||||
|
END AS "projectId",
|
||||||
|
MAX(projectrcn) AS "id",
|
||||||
|
'ec:RCN' AS "type"
|
||||||
|
FROM participants
|
||||||
|
WHERE projectid IS NOT NULL AND projectrcn IS NOT NULL
|
||||||
|
GROUP BY projectid, fundingprogram
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,16 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
CASE
|
||||||
|
WHEN fundingprogram='FP7' THEN '40|corda_______::'||MD5(projectid)
|
||||||
|
WHEN fundingprogram='H2020' THEN '40|corda__h2020::'||MD5(projectid)
|
||||||
|
ELSE '40|unknown_____::'||MD5(projectid)
|
||||||
|
END AS "id",
|
||||||
|
MAX(projectacronym) AS "acronym",
|
||||||
|
'EC' AS "funder",
|
||||||
|
fundingprogram AS "fundingLevel0"
|
||||||
|
FROM participants
|
||||||
|
WHERE projectid IS NOT NULL
|
||||||
|
GROUP BY
|
||||||
|
projectid,
|
||||||
|
fundingprogram
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,28 @@
|
||||||
|
CREATE TABLE participants (
|
||||||
|
projectrcn text,
|
||||||
|
projectid text,
|
||||||
|
projectacronym text,
|
||||||
|
role text,
|
||||||
|
orgid text,
|
||||||
|
orgname text,
|
||||||
|
orgshortname text,
|
||||||
|
activitytype text,
|
||||||
|
endofparticipation text,
|
||||||
|
eccontribution text,
|
||||||
|
country text,
|
||||||
|
street text,
|
||||||
|
city text,
|
||||||
|
postcode text,
|
||||||
|
organizationurl text,
|
||||||
|
vatnumber text,
|
||||||
|
contacttype text,
|
||||||
|
contacttitle text,
|
||||||
|
contactfirstnames text,
|
||||||
|
contactlastnames text,
|
||||||
|
contactfunction text,
|
||||||
|
contacttelephonenumber text,
|
||||||
|
contactfaxnumber text,
|
||||||
|
contactform text,
|
||||||
|
fundingprogram varchar(10)
|
||||||
|
);
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
The script should be launched in OpenAIRE production server (services.openaire.eu)
|
|
@ -0,0 +1,16 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
BASEDIR=/tmp/ecProjectsOpenaire
|
||||||
|
|
||||||
|
echo "Saving files in $BASEDIR ..."
|
||||||
|
|
||||||
|
rm -rf $BASEDIR
|
||||||
|
mkdir $BASEDIR
|
||||||
|
|
||||||
|
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f projects2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/project.json
|
||||||
|
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f orgs2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/organization.json
|
||||||
|
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f projOrg2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/projectOrganization.json
|
||||||
|
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f orgOtherId2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/organizationOtherId.json
|
||||||
|
psql -h postgresql.services.openaire.eu -U dnet dnet_openaireplus -f projOtherId2json.sql | sed 's/\\\\/\\/g' > $BASEDIR/projectOtherId.json
|
||||||
|
|
||||||
|
echo Done.
|
|
@ -0,0 +1,8 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
'20|ec__________::'||MD5(substring(id from 15)) AS "orgId",
|
||||||
|
substring(id from 15) AS "id",
|
||||||
|
'ec:PIC' AS "type"
|
||||||
|
FROM dsm_organizations
|
||||||
|
WHERE id LIKE 'corda%'
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,28 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (SELECT
|
||||||
|
'20|ec__________::'||MD5(substring(o.id from 15)) AS "id",
|
||||||
|
o.legalname AS "name",
|
||||||
|
o.legalshortname AS "shortName",
|
||||||
|
o.country AS "country",
|
||||||
|
o.websiteurl AS "url",
|
||||||
|
o.ec_legalbody AS "ecLegalBody",
|
||||||
|
o.ec_legalperson AS "ecLegalPerson",
|
||||||
|
o.ec_nonprofit AS "ecNonProfit",
|
||||||
|
o.ec_researchorganization AS "ecResearchOrganization",
|
||||||
|
o.ec_highereducation AS "ecHigherEducation",
|
||||||
|
o.ec_internationalorganizationeurinterests AS "ecInternationalOrganizationEurInterests",
|
||||||
|
o.ec_internationalorganization AS "ecInternationalOrganization",
|
||||||
|
o.ec_enterprise AS "ecEnterprise",
|
||||||
|
o.ec_smevalidated AS "ecSmeValidated",
|
||||||
|
o.ec_nutscode AS "ecNutsCode"
|
||||||
|
FROM
|
||||||
|
dsm_organizations o
|
||||||
|
LEFT OUTER JOIN project_organization po ON (po.resporganization = o.id)
|
||||||
|
WHERE
|
||||||
|
o.id LIKE 'corda%'
|
||||||
|
) t) TO STDOUT;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,13 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (SELECT
|
||||||
|
'40|'||substring(project from 1 for 12)||'::'||MD5(substring(project from 15)) AS "projectId",
|
||||||
|
'20|ec__________::'||MD5(substring(resporganization from 15)) AS "orgId",
|
||||||
|
semanticclass AS "role"
|
||||||
|
FROM
|
||||||
|
project_organization
|
||||||
|
WHERE
|
||||||
|
project LIKE 'corda%'
|
||||||
|
) t) TO STDOUT;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
'40|'||substring(id from 1 for 12)||'::'||MD5(substring(id from 15)) AS "projectId",
|
||||||
|
code AS "id",
|
||||||
|
'ec:grant_id' AS "type"
|
||||||
|
FROM projects
|
||||||
|
WHERE id LIKE 'corda%'
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,23 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (SELECT
|
||||||
|
'40|'||substring(p.id from 1 for 12)||'::'||MD5(substring(p.id from 15)) AS "id",
|
||||||
|
p.title AS "title",
|
||||||
|
p.acronym AS "acronym",
|
||||||
|
p.call_identifier AS "callId",
|
||||||
|
split_part(pf.funding, '::', 2) AS "funder",
|
||||||
|
split_part(pf.funding, '::', 3) AS "fundingLevel0",
|
||||||
|
split_part(pf.funding, '::', 4) AS "fundingLevel1",
|
||||||
|
split_part(pf.funding, '::', 5) AS "fundingLevel2",
|
||||||
|
p.startdate AS "startDate",
|
||||||
|
p.enddate AS "endDate",
|
||||||
|
p.websiteurl AS "websiteUrl",
|
||||||
|
p.keywords AS "keywords",
|
||||||
|
p.contracttypescheme||':'||p.contracttypeclass AS "contractType",
|
||||||
|
p.ec_sc39 AS "ecSc39",
|
||||||
|
p.oa_mandate_for_publications AS "oaMandateForPublications",
|
||||||
|
p.ec_article29_3 AS "ecArticle29_3"
|
||||||
|
FROM
|
||||||
|
projects p
|
||||||
|
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
|
||||||
|
WHERE
|
||||||
|
p.id LIKE 'corda%'
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,24 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
pmcid AS "id",
|
||||||
|
'pmcid' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE pmcid IS NOT NULL AND pmcid != ''
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
pmid AS "id",
|
||||||
|
'pmid' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE pmid IS NOT NULL AND pmid != ''
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
doi AS "id",
|
||||||
|
'doi' AS "type"
|
||||||
|
FROM data
|
||||||
|
WHERE doi IS NOT NULL AND doi != ''
|
||||||
|
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,51 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
excelFile="../../orig/fundersData/Funders, DOIS 31122018.xlsx"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
workdir=/tmp/funderData
|
||||||
|
rm -rf "$workdir" && mkdir "$workdir"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Funder Data Import:"
|
||||||
|
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Generating csv file"
|
||||||
|
csv="$workdir/funderdata.csv"
|
||||||
|
xlsx2csv -c UTF-8 "$excelFile" > $csv
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Recreating the funderdata database"
|
||||||
|
dropdb funderdata --if-exists;
|
||||||
|
createdb funderdata;
|
||||||
|
psql funderdata -f schema.sql
|
||||||
|
|
||||||
|
if [[ -f "$csv" ]]; then
|
||||||
|
echo " - Importing data: $csv"
|
||||||
|
psql funderdata -c "COPY data(funder,pmcid,pmid,source,doi) FROM '$csv' CSV HEADER;"
|
||||||
|
else
|
||||||
|
echo " - Invalid file: $csv"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
echo " - Fix funder names"
|
||||||
|
psql funderdata -c "UPDATE data SET funder='EC' WHERE funder = 'Marie Curie'"
|
||||||
|
psql funderdata -c "UPDATE data SET funder='EC' WHERE funder = 'European Research Council'"
|
||||||
|
psql funderdata -c "UPDATE data SET funder='Breast Cancer Now' WHERE funder = 'BreastCancerNow'"
|
||||||
|
psql funderdata -c "UPDATE data SET funder='Wellcome Trust' WHERE funder = 'Wellcome Trust/DBT India Alliance'"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
echo " - Generating json files"
|
||||||
|
rm -f ../../jsonfiles/funderdata/*.json
|
||||||
|
psql funderdata -f projects2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/funderdata/project.json
|
||||||
|
psql funderdata -f docOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/funderdata/docotherid.json
|
||||||
|
psql funderdata -f projDocOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/funderdata/projectdocotherid.json
|
||||||
|
|
||||||
|
echo "Done."
|
||||||
|
echo
|
|
@ -0,0 +1,26 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
|
||||||
|
pmcid AS "docId",
|
||||||
|
'pmcid' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE pmcid IS NOT NULL AND pmcid != ''
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
|
||||||
|
pmid AS "docId",
|
||||||
|
'pmid' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE pmid IS NOT NULL AND pmid != ''
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
|
||||||
|
doi AS "docId",
|
||||||
|
'doi' AS "docIdType"
|
||||||
|
FROM data
|
||||||
|
WHERE doi IS NOT NULL AND doi != ''
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,6 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (SELECT distinct
|
||||||
|
'40|MOCK_PROJECT::'||MD5(funder) AS "id",
|
||||||
|
'MOCK PROJECT' AS "title",
|
||||||
|
funder AS "funder"
|
||||||
|
FROM data
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,8 @@
|
||||||
|
CREATE TABLE data (
|
||||||
|
funder text,
|
||||||
|
pmcid text,
|
||||||
|
pmid text,
|
||||||
|
source text,
|
||||||
|
doi text
|
||||||
|
);
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
Dear Claudio, Please find the json file containing the clinical guideline base data attached.
|
||||||
|
|
||||||
|
It is formatted as follows:
|
||||||
|
|
||||||
|
LocalID [Our local guideline ID]
|
||||||
|
Type "guideline"
|
||||||
|
Title [Title of guideline]
|
||||||
|
PubYear [Guideline publication year]
|
||||||
|
Originator [Organization that created the guideline (subset of ProviderCollection)]
|
||||||
|
ProviderCollection [Collection name]
|
||||||
|
Abstract [Guideline abstract (from PubMed, if available (only from WHO, NICE and Cochrane))]
|
||||||
|
PMID [PMID if available]
|
||||||
|
DOI [DOI if available]
|
||||||
|
PMCID [PMCID if available]
|
||||||
|
MatchedReferences: [references matched with Our set of publications as PMID (as well as PMCID and funder name)]
|
||||||
|
[All]References: [All references in each guideline]
|
||||||
|
|
||||||
|
We also have the full text for WHO, NICE and Cochrane, as well as the PDF:s for the German AWMF guidelines, but it is still uncertain how this material could be shared due to copyright issues.
|
||||||
|
|
||||||
|
|
||||||
|
FILE: /data/d4i/guidelines.json.zip
|
|
@ -0,0 +1,5 @@
|
||||||
|
Al momento non sono gestite le MatchedReferences.
|
||||||
|
|
||||||
|
Il modello attuale prevede la tabella Citation per mettere in relazione due documenti.
|
||||||
|
|
||||||
|
Forse deve essere rivisto.
|
|
@ -0,0 +1,39 @@
|
||||||
|
COPY (SELECT row_to_json(t) FROM (
|
||||||
|
SELECT
|
||||||
|
'50|guidelines__::'||MD5(gid) AS "docId1",
|
||||||
|
pmcid AS "docId2",
|
||||||
|
'pmcid' AS "docId2Type",
|
||||||
|
'guidelines_matched' AS "relType"
|
||||||
|
FROM relations
|
||||||
|
WHERE pmcid IS NOT NULL AND pmcid != ''
|
||||||
|
|
||||||
|
UNION
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'50|guidelines__::'||MD5(gid) AS "docId1",
|
||||||
|
pmid AS "docId2",
|
||||||
|
'pmid' AS "docId2Type",
|
||||||
|
'guidelines_matched' AS "relType"
|
||||||
|
FROM relations
|
||||||
|
WHERE pmid IS NOT NULL AND pmid != ''
|
||||||
|
|
||||||
|
UNION
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'50|guidelines__::'||MD5(gid) AS "docId1",
|
||||||
|
doi AS "docId2",
|
||||||
|
'doi' AS "docId2Type",
|
||||||
|
'guidelines_matched' AS "relType"
|
||||||
|
FROM relations
|
||||||
|
WHERE doi IS NOT NULL AND doi != ''
|
||||||
|
|
||||||
|
UNION
|
||||||
|
|
||||||
|
SELECT
|
||||||
|
'50|guidelines__::'||MD5(gid) AS "docId1",
|
||||||
|
rel AS "docId2",
|
||||||
|
'pmid' AS "docId2Type",
|
||||||
|
'guidelines_all' AS "relType"
|
||||||
|
FROM allrefs
|
||||||
|
WHERE rel IS NOT NULL AND rel != ''
|
||||||
|
) t) TO STDOUT;
|
|
@ -0,0 +1,72 @@
|
||||||
|
-- Export every known identifier as a JSON line.
-- Rows from "guidelines" carry the internal docId; rows from "relations" and
-- "allrefs" list identifiers of referenced publications (docId is NULL).
COPY (SELECT row_to_json(t) FROM (
	SELECT
		'50|guidelines__::'||MD5(id) AS "docId",
		id AS "id",
		'guidelineLocalID' AS "type"
	FROM guidelines

	UNION ALL

	SELECT
		'50|guidelines__::'||MD5(id) AS "docId",
		pmcid AS "id",
		'pmcid' AS "type"
	FROM guidelines
	WHERE pmcid IS NOT NULL AND pmcid != ''

	UNION ALL

	SELECT
		'50|guidelines__::'||MD5(id) AS "docId",
		pmid AS "id",
		'pmid' AS "type"
	FROM guidelines
	WHERE pmid IS NOT NULL AND pmid != ''

	UNION ALL

	SELECT
		'50|guidelines__::'||MD5(id) AS "docId",
		doi AS "id",
		'doi' AS "type"
	FROM guidelines
	WHERE doi IS NOT NULL AND doi != ''

	UNION

	SELECT
		NULL AS "docId",
		pmcid AS "id",
		'pmcid' AS "type"
	FROM relations
	WHERE pmcid IS NOT NULL AND pmcid != ''

	UNION

	SELECT
		NULL AS "docId",
		pmid AS "id",
		'pmid' AS "type"
	FROM relations
	WHERE pmid IS NOT NULL AND pmid != ''

	UNION

	SELECT
		NULL AS "docId",
		doi AS "id",
		'doi' AS "type"
	FROM relations
	-- FIX: the original filtered this branch on the wrong column
	-- (WHERE pmid IS NOT NULL AND doi != ''), so doi rows were only
	-- exported when a pmid also happened to be present.
	WHERE doi IS NOT NULL AND doi != ''

	UNION

	SELECT
		NULL AS "docId",
		rel AS "id",
		'pmid' AS "type"
	FROM allrefs
	WHERE rel IS NOT NULL AND rel != ''
) t) TO STDOUT;
|
|
@ -0,0 +1,12 @@
|
||||||
|
-- Export one JSON document record per guideline.
COPY (SELECT row_to_json(t) FROM (
	SELECT
		'50|guidelines__::'||MD5(g.id) AS "id",
		g.title AS "title",
		g.abstract AS "abstractText",
		g.gtype AS "type",
		g.year AS "pubYear",
		g.orig AS "repository",
		g.collection AS "collection"
	FROM guidelines g
) t) TO STDOUT;
|
|
@ -0,0 +1,67 @@
|
||||||
|
#!/bin/bash
# Import the D4I guidelines JSON dump into a local "guidelines" postgres DB
# and export the JSON files consumed by the downstream loader.

#detailsFile=../../orig/guidelines/guidelines.json
detailsFile=/tmp/guidelines.json

workdir=/tmp/guidelines
rm -rf "$workdir" && mkdir "$workdir"

echo
echo "Guidelines Import:"

#--------------------------------
echo " - Generating csv files"
csvGuidelines="$workdir/guidelines.csv"
csvRels="$workdir/rels.csv"
csvAllRels="$workdir/allRels.csv"

# jq reads the input file directly (no useless cat); all expansions are quoted
# so paths with spaces cannot break word-splitting.
jq 'map([.LocalID, .Type, .Title, .PubYear, .Originator, .ProviderCollection, .Abstract, .PMID, .DOI, .PMCID])' "$detailsFile" | jq '.[]' | jq -r '@csv' > "$csvGuidelines"
jq -r '.[] | .LocalID as $id | (.MatchedReferences | map([$id, (.PMID + ""), (.PMCID + ""), (.DOI + "") , ( .Funders | map(.+"#") | add | . + "" ) ]) )[] | @csv' "$detailsFile" > "$csvRels"
jq -r '.[] | .LocalID as $id | (.AllReferences | map([$id, .]) )[] | @csv' "$detailsFile" > "$csvAllRels"

#--------------------------------
echo " - Recreating the guidelines database"
dropdb guidelines --if-exists;
createdb guidelines;
psql guidelines -f schema.sql

if [[ -f "$csvGuidelines" ]]; then
	echo " - Importing guidelines: $csvGuidelines"
	psql guidelines -c "COPY guidelines(id, gtype, title, year, orig, collection, abstract, pmid, doi, pmcid) FROM '$csvGuidelines' CSV;"
else
	echo " - Invalid file: $csvGuidelines"
fi

if [[ -f "$csvRels" ]]; then
	echo " - Importing rels: $csvRels"
	psql guidelines -c "COPY relations(gid, pmid, pmcid, doi, funder) FROM '$csvRels' CSV;"
else
	echo " - Invalid file: $csvRels"
fi

if [[ -f "$csvAllRels" ]]; then
	echo " - Importing all rels: $csvAllRels"
	psql guidelines -c "COPY allrefs(gid, rel) FROM '$csvAllRels' CSV;"
else
	echo " - Invalid file: $csvAllRels"
fi

#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/guidelines/*.json
psql guidelines -f document2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/document.json
psql guidelines -f docOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/docotherid.json
psql guidelines -f projects2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/project.json
psql guidelines -f projDocOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/projectdocotherid.json
psql guidelines -f docDocumentOtherId2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/guidelines/docDocumentOtherId.json

#--------------------------------
echo " - Importing final files"
# FIX: abort instead of continuing in the wrong directory if cd fails.
cd ../../jsonfiles/guidelines || exit 1

echo "Done."
echo
|
@ -0,0 +1,20 @@
|
||||||
|
-- Mock-project -> document links, derived from the '#'-separated funder list
-- stored on each matched relation.
COPY (SELECT row_to_json(t) FROM (
	SELECT
		'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
		pmcid AS "docId",
		'pmcid' AS "docIdType"
	FROM (select * from (select pmid, pmcid, unnest(string_to_array(funder, '#')) as funder from relations) as t where length(t.funder) > 0) r
	WHERE pmcid IS NOT NULL AND pmcid != ''

	UNION ALL

	SELECT
		'40|MOCK_PROJECT::'||MD5(funder) AS "projectId",
		pmid AS "docId",
		'pmid' AS "docIdType"
	FROM (select * from (select pmid, pmcid, unnest(string_to_array(funder, '#')) as funder from relations) as t where length(t.funder) > 0) r
	WHERE pmid IS NOT NULL AND pmid != ''
) t) TO STDOUT;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
-- One mock project per distinct funder found in the relations table
-- (the funder column holds '#'-separated funder names).
COPY (SELECT row_to_json(t) FROM (SELECT distinct
	'40|MOCK_PROJECT::'||MD5(funder) AS "id",
	'MOCK PROJECT' AS "title",
	funder AS "funder"
FROM
	(SELECT DISTINCT unnest(string_to_array(funder, '#')) AS funder FROM relations ) r WHERE LENGTH(r.funder) > 0
) t) TO STDOUT;
|
|
@ -0,0 +1,25 @@
|
||||||
|
-- Guideline metadata: one row per guideline, loaded from the CSV export.
CREATE TABLE guidelines (
	id text,
	gtype text,
	title text,
	year text,
	orig text,
	collection text,
	abstract text,
	pmid text,
	doi text,
	pmcid text
);

-- Matched references: guideline -> publication identifiers plus a
-- '#'-separated funder list.
CREATE TABLE relations (
	gid text,
	pmid text,
	pmcid text,
	doi text,
	funder text
);

-- All references of a guideline (raw identifier strings).
CREATE TABLE allrefs (
	gid text,
	rel text
);
|
|
@ -0,0 +1,49 @@
|
||||||
|
#!/bin/bash
# Load the per-ICD network-analysis CSVs (one directory per ICD code, one
# file per period) into the network_analysis_metrics table.

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

table=network_analysis_metrics

# FIX: typo in the original log message ("Recrreating").
echo "Recreating table $table"
psql -h localhost -U d4i data4impact -c "DROP TABLE IF EXISTS $table;"
psql -h localhost -U d4i data4impact -c "CREATE TABLE $table (betweenness_centrality double precision,closeness_centrality double precision,degree_centrality double precision,eccentricity_centrality double precision,eigenvector_centrality double precision,farness_centrality double precision,pic text,name text, icd int, period text, orgid text);"
echo

for icd in {1..19}
do
	if [ -d "$DIR/$icd" ]; then
		cd "$DIR/$icd"
		for csv in *.csv
		do
			# File names start with the first year of the period (e.g. 2010….csv);
			# use parameter expansion instead of echo|cut and $((…)) instead of expr.
			y1="${csv:0:4}"
			y2=$((y1 + 1))
			period="$y1-$y2"

			echo "Processing file $DIR/$icd/$csv..."

			# Some CSVs carry an extra eccentricity_centrality column.
			if grep --quiet eccentricity_centrality "$DIR/$icd/$csv"; then
				psql -h localhost -U d4i data4impact -c "COPY $table (betweenness_centrality,closeness_centrality,degree_centrality,eccentricity_centrality,eigenvector_centrality,farness_centrality,pic,name) FROM '$DIR/$icd/$csv' CSV HEADER;"
			else
				psql -h localhost -U d4i data4impact -c "COPY $table (betweenness_centrality,closeness_centrality,degree_centrality,eigenvector_centrality,farness_centrality,pic,name) FROM '$DIR/$icd/$csv' CSV HEADER;"
			fi

			# Tag the rows just imported (icd is NULL only for new rows).
			psql -h localhost -U d4i data4impact -c "UPDATE $table SET (icd,period) = ($icd,'$period') WHERE icd IS NULL;"
			echo;
		done
	fi
done

echo "Fixing values..."
# NOTE(review): replace() strips EVERY '.0' substring, not just a trailing
# one — confirm pic values can never contain '.0' internally.
psql -h localhost -U d4i data4impact -c "UPDATE $table SET pic = replace(pic, '.0', '') WHERE pic IS NOT NULL;"
psql -h localhost -U d4i data4impact -c "UPDATE $table SET orgid = '20|ec__________::'||MD5(pic) WHERE pic IS NOT NULL;"

echo
echo "Done."
echo
echo
|
|
@ -0,0 +1,211 @@
|
||||||
|
# MANUAL STEPS FOR news-blogs-forum
|
||||||
|
|
||||||
|
1) cd /data/ftp/d4i/social_data/news-blogs-forum
|
||||||
|
|
||||||
|
2) find *.zip -exec bash -c "unzip -p {} | jq --slurp -r 'map([.content,.actor,.topicId,.mediatype,.source,.headline,.url,.dt,.language,.country]) | .[] | @csv'" \; | sed 's/\x00//g' > data4impact_corpus_allmedia.csv
|
||||||
|
|
||||||
|
3) Recreate the table in the DB using
|
||||||
|
|
||||||
|
DROP TABLE socialdata;
|
||||||
|
DROP SEQUENCE socialdata_serial;
|
||||||
|
|
||||||
|
CREATE SEQUENCE socialdata_serial START 1;
|
||||||
|
|
||||||
|
CREATE TABLE socialdata (
|
||||||
|
id text PRIMARY KEY DEFAULT '51|social__data::'||MD5(nextval('socialdata_serial')::text),
|
||||||
|
content text,
|
||||||
|
actor text,
|
||||||
|
topicId text,
|
||||||
|
mediatype text,
|
||||||
|
source text,
|
||||||
|
headline text,
|
||||||
|
url text,
|
||||||
|
dt text,
|
||||||
|
language text,
|
||||||
|
country text
|
||||||
|
);
|
||||||
|
|
||||||
|
4) Insert data:
|
||||||
|
|
||||||
|
COPY socialdata(content,actor,topicId,mediatype,source,headline,url,dt,language,country) FROM '/data/ftp/d4i/social_data/news-blogs-forum/data4impact_corpus_allmedia.csv' CSV;
|
||||||
|
|
||||||
|
(OPTIONAL) if (error_during_copy) -> perl -pi -e 's/\x00//g' data4impact_corpus_allmedia.csv
|
||||||
|
|
||||||
|
#############################################################################################################################################################
|
||||||
|
|
||||||
|
# MANUAL STEPS FOR twitter (Buzz)
|
||||||
|
|
||||||
|
1) cd "/data/ftp/d4i/social_data/twitter/Buzz JSON Feb"
|
||||||
|
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.rtid,.rtDate,.date,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter_buzz.csv
|
||||||
|
3) recreate the table
|
||||||
|
|
||||||
|
DROP TABLE twitterbuzz;
|
||||||
|
DROP SEQUENCE twitterbuzz_serial;
|
||||||
|
|
||||||
|
CREATE SEQUENCE twitterbuzz_serial START 1;
|
||||||
|
|
||||||
|
CREATE TABLE twitterbuzz (
|
||||||
|
id text PRIMARY KEY DEFAULT '52|twitter_buzz::'||MD5(nextval('twitterbuzz_serial')::text),
|
||||||
|
tags text,
|
||||||
|
language text,
|
||||||
|
country text,
|
||||||
|
content text,
|
||||||
|
topicid text,
|
||||||
|
sourcetype text,
|
||||||
|
source text,
|
||||||
|
actor text,
|
||||||
|
rtid text,
|
||||||
|
rtdate text,
|
||||||
|
date text,
|
||||||
|
headline text
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE twitterbuzz_tags(
|
||||||
|
tb_id text REFERENCES twitterbuzz(id),
|
||||||
|
tag text,
|
||||||
|
PRIMARY KEY(tb_id, tag)
|
||||||
|
);
|
||||||
|
|
||||||
|
4) Insert data:
|
||||||
|
|
||||||
|
COPY twitterbuzz(tags,language,country,content,topicid,sourcetype,source,actor,rtid,rtdate,date,headline) FROM '/data/ftp/d4i/social_data/twitter/Buzz JSON Feb/twitter_buzz.csv' CSV;
|
||||||
|
|
||||||
|
5) patch data:
|
||||||
|
UPDATE twitterbuzz SET tags = '' WHERE tags = 'null';
|
||||||
|
UPDATE twitterbuzz SET tags = replace(tags, ',,', ',') WHERE tags LIKE '%,,%';
|
||||||
|
|
||||||
|
insert into twitterbuzz_tags(tb_id, tag) select distinct * from (select id, regexp_split_to_table(tags, ',') as tag from twitterbuzz) as t where tag != '';
|
||||||
|
alter table twitterbuzz drop column tags;
|
||||||
|
|
||||||
|
|
||||||
|
#############################################################################################################################################################
|
||||||
|
|
||||||
|
# MANUAL STEPS FOR twitter
|
||||||
|
|
||||||
|
1) cd "/data/ftp/d4i/social_data/twitter/Corrected JSON"
|
||||||
|
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.retweetedActor,(.urls | join("§")),.datetime,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter.csv
|
||||||
|
|
||||||
|
3) Recreate the tables:

DROP TABLE twitter;
|
||||||
|
DROP SEQUENCE twitter_serial;
|
||||||
|
|
||||||
|
CREATE SEQUENCE twitter_serial START 1;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE twitter (
|
||||||
|
id text PRIMARY KEY DEFAULT '52|twitter_____::'||MD5(nextval('twitter_serial')::text),
|
||||||
|
tags text,
|
||||||
|
language text,
|
||||||
|
country text,
|
||||||
|
content text,
|
||||||
|
topicid text,
|
||||||
|
sourcetype text,
|
||||||
|
source text,
|
||||||
|
actor text,
|
||||||
|
retweetedactor text,
|
||||||
|
urls text,
|
||||||
|
datetime timestamp,
|
||||||
|
headline text
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE twitter_tags(
|
||||||
|
t_id text REFERENCES twitter(id),
|
||||||
|
tag text,
|
||||||
|
PRIMARY KEY(t_id, tag)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE twitter_urls(
|
||||||
|
t_id text REFERENCES twitter(id),
|
||||||
|
url text,
|
||||||
|
PRIMARY KEY(t_id, url)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
4) Insert data:
|
||||||
|
|
||||||
|
COPY twitter(tags,language,country,content,topicid,sourcetype,source,actor,retweetedactor,urls,datetime,headline) FROM '/data/ftp/d4i/social_data/twitter/Corrected JSON/twitter.csv' CSV;
|
||||||
|
|
||||||
|
5) patch data:
|
||||||
|
insert into twitter_tags(t_id, tag) select distinct * from (select id, regexp_split_to_table(tags, ',') as tag from twitter) as t where tag != '' and tag != 'null';
|
||||||
|
insert into twitter_urls(t_id, url) select distinct * from (select id, regexp_split_to_table(urls, '§') as url from twitter) as t where url != '' and url != 'null';
|
||||||
|
alter table twitter drop column tags;
|
||||||
|
alter table twitter drop column urls;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#############################################################################################################################################################
|
||||||
|
|
||||||
|
# MANUAL STEPS FOR twitter_threads
|
||||||
|
|
||||||
|
1) cd "/data/ftp/d4i/social_data/twitter/Threads"
|
||||||
|
2)
|
||||||
|
|
||||||
|
jq -r 'map([.threadId, .length, .velocity, .participants, .startId, .startTime, .endTime]) | .[] | @csv' twitter_threads_metadata.json > twitter_threads_metadata.csv
|
||||||
|
rm twitter_threads_tweets.csv
|
||||||
|
|
||||||
|
# jq-1.6 is required
|
||||||
|
jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' twitter_threads_doi.json >> twitter_threads_tweets.csv
|
||||||
|
jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' twitter_threads_q1.json >> twitter_threads_tweets.csv
|
||||||
|
jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' twitter_threads_q2.json >> twitter_threads_tweets.csv
|
||||||
|
jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' twitter_threads_q3.json >> twitter_threads_tweets.csv
|
||||||
|
jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' twitter_threads_q4.json >> twitter_threads_tweets.csv
|
||||||
|
jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' twitter_threads_q5.json >> twitter_threads_tweets.csv
|
||||||
|
|
||||||
|
3)
|
||||||
|
|
||||||
|
DROP TABLE IF EXISTS twitter_threads
|
||||||
|
CREATE TABLE twitter_threads (
|
||||||
|
id text PRIMARY KEY,
|
||||||
|
length int,
|
||||||
|
velocity double precision,
|
||||||
|
participants int,
|
||||||
|
startid int,
|
||||||
|
starttime timestamp,
|
||||||
|
endtime timestamp
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE twitter_threads_tweets(
|
||||||
|
threadid text REFERENCES twitter_threads(id),
|
||||||
|
tweetid int,
|
||||||
|
fromuser text,
|
||||||
|
touser text,
|
||||||
|
inreplyto int,
|
||||||
|
datetime timestamp,
|
||||||
|
content text,
|
||||||
|
quotedstatus text,
|
||||||
|
mentions text,
|
||||||
|
urls text,
|
||||||
|
PRIMARY KEY (threadid, tweetid)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE twitter_threads_tweets_mentions(
|
||||||
|
threadid text,
|
||||||
|
tweetid int,
|
||||||
|
mention text,
|
||||||
|
PRIMARY KEY (threadid, tweetid, mention),
|
||||||
|
FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE twitter_threads_tweets_urls(
|
||||||
|
threadid text,
|
||||||
|
tweetid int,
|
||||||
|
url text,
|
||||||
|
PRIMARY KEY (threadid, tweetid, url),
|
||||||
|
FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
4) Insert data:
|
||||||
|
|
||||||
|
COPY twitter_threads(id, length, velocity, participants, startid, starttime, endtime) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_metadata.csv' CSV;
|
||||||
|
COPY twitter_threads_tweets(threadid, tweetid, fromuser, touser, inreplyto, datetime, content, quotedstatus, mentions, urls) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_tweets.csv' CSV;
|
||||||
|
|
||||||
|
|
||||||
|
5) patch data:
|
||||||
|
|
||||||
|
insert into twitter_threads_tweets_mentions(threadid, tweetid, mention) select distinct * from (select threadid, tweetid, regexp_split_to_table(mentions, '§') as mention from twitter_threads_tweets) as t where mention != '' and mention != 'null';
|
||||||
|
insert into twitter_threads_tweets_urls (threadid, tweetid, url) select distinct * from (select threadid, tweetid, regexp_split_to_table(urls, '§') as url from twitter_threads_tweets) as t where url != '' and url != 'null';
|
||||||
|
alter table twitter_threads_tweets drop column mentions;
|
||||||
|
alter table twitter_threads_tweets drop column urls;
|
||||||
|
|
|
@ -0,0 +1,43 @@
|
||||||
|
#!/bin/bash
# Import the FP7 patents Excel sheet into a local "patents_excel" postgres DB
# and refresh the derived materialized views.

excelFile="../../orig/patents/FP7_patents_full_list_Except_for_ICT.xlsx"

workdir=/tmp/patentsExcel
rm -rf "$workdir" && mkdir "$workdir"

echo
echo "Patents Import:"

#--------------------------------
echo " - Generating csv file"
csv="$workdir/patents.csv"
xlsx2csv -c UTF-8 "$excelFile" > "$csv"

#--------------------------------
echo " - Recreating the patents_excel database"
dropdb patents_excel --if-exists;
createdb patents_excel;
psql patents_excel -f schema.sql

if [[ -f "$csv" ]]; then
	echo " - Importing data: $csv"
	psql patents_excel -c "COPY data(pat_id,type_ip,appnum,appnt,title,pat_url,pat_ref,pat_auth,pat_num,pat_kind,note,appln_id,appln_title_patstat,priority_year,var15,projectid) FROM '$csv' CSV HEADER;"
else
	echo " - Invalid file: $csv"
fi

# FIX: the materialized views (document, doc_other_identifier, doc_project)
# are created by this script's schema.sql inside the patents_excel database;
# the original refreshed them on the unrelated "patents" database.
psql patents_excel -c "REFRESH MATERIALIZED VIEW document"
psql patents_excel -c "REFRESH MATERIALIZED VIEW doc_other_identifier"
psql patents_excel -c "REFRESH MATERIALIZED VIEW doc_project"

#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/patents_excel/*.json

#psql patents_excel -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM document ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents_excel/document.json
#psql patents_excel -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_other_identifier) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents_excel/doc_other_identifier.json
#psql patents_excel -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_project ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents_excel/doc_project.json

echo "Done."
echo
|
|
@ -0,0 +1,42 @@
|
||||||
|
-- Raw rows imported from the FP7 patents Excel sheet (all columns text).
CREATE TABLE data(
	pat_id text,
	type_ip text,
	appnum text,
	appnt text,
	title text,
	pat_url text,
	pat_ref text,
	pat_auth text,
	pat_num text,
	pat_kind text,
	note text,
	appln_id text,
	appln_title_patstat text,
	priority_year text,
	var15 text,
	projectid text
);


-- Document record per patent (keyed on the normalized application id;
-- type_ip has its plural 's' stripped and is lowercased).
CREATE MATERIALIZED VIEW document AS SELECT
	'50|patents_____::'||MD5(lower(trim(appln_id))) AS "id",
	title AS "title",
	lower(regexp_replace(type_ip,'s$','')) AS "type",
	priority_year AS "pubYear",
	'patent repo'::text AS "repository"
FROM data
WHERE appln_id IS NOT NULL AND trim(appln_id) != '';

-- Original application id kept as an alternative identifier.
CREATE MATERIALIZED VIEW doc_other_identifier AS SELECT
	'50|patents_____::'||MD5(lower(trim(appln_id))) AS "docId",
	trim(appln_id) AS "id",
	'patent'::text AS "type"
FROM data
WHERE appln_id IS NOT NULL AND trim(appln_id) != '';

-- Patent -> CORDA project links, for rows that carry a projectid.
CREATE MATERIALIZED VIEW doc_project AS SELECT
	'50|patents_____::'||MD5(lower(trim(appln_id))) AS "docId",
	'40|corda_______::'||MD5(lower(trim(projectid))) AS "projectId"
FROM data
WHERE appln_id IS NOT NULL AND trim(appln_id) != '' AND projectid IS NOT NULL AND trim(projectid) != '';
|
|
@ -0,0 +1,3 @@
|
||||||
|
Patent data are available at ftp://prozac.madgik.di.uoa.gr
|
||||||
|
username: patentdata
|
||||||
|
passwd: d4ipatents
|
|
@ -0,0 +1,48 @@
|
||||||
|
#!/bin/bash
# Import the patents JSON dumps into the "patents" postgres DB and export
# the JSON files consumed by the downstream loader.

#jsonPatents=../../orig/patents/patents.json
#jsonFulltexts=../../orig/patents/patents_txt.json

jsonPatents=../../orig/patents/patents_update.json
jsonFulltexts=../../orig/patents/patents_update_txt.json

echo
echo "Patents Import:"

#--------------------------------
echo " - Recreating the patents database"
dropdb patents --if-exists
createdb patents
psql patents -f schema.sql

#--------------------------------
# Resolve to an absolute path: COPY FROM runs server-side and needs it.
inputJsonPatentsFile="$(cd "$(dirname "$jsonPatents")"; pwd -P)/$(basename "$jsonPatents")"
echo " - Importing json $inputJsonPatentsFile"
# quote/delimiter set to unused control chars so each raw JSON line loads as-is.
psql patents -c "copy patents_json from '$inputJsonPatentsFile' csv quote e'\x01' delimiter e'\x02'"

#--------------------------------
inputJsonFulltextsFile="$(cd "$(dirname "$jsonFulltexts")"; pwd -P)/$(basename "$jsonFulltexts")"
# FIX: log the resolved absolute path (the original echoed the relative one,
# inconsistently with the patents-metadata step above).
echo " - Importing json $inputJsonFulltextsFile"
psql patents -c "copy patents_text_json from '$inputJsonFulltextsFile' csv quote e'\x01' delimiter e'\x02'"

#--------------------------------
echo " - Refreshing views"
psql patents -c "REFRESH MATERIALIZED VIEW document"
psql patents -c "REFRESH MATERIALIZED VIEW doc_fulltext"
psql patents -c "REFRESH MATERIALIZED VIEW doc_other_identifier"
psql patents -c "REFRESH MATERIALIZED VIEW project"
psql patents -c "REFRESH MATERIALIZED VIEW doc_project"

#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/patents/*.json
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM document ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/document.json
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_fulltext ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/doc_fulltext.json
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_other_identifier) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/doc_other_identifier.json

# COMMENT THE FOLLOWING LINES IF THE PATENTS ARE NOT RELATED TO FP7
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM project ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/project.json
psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_project ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents/doc_project.json

echo "Done."
echo
|
|
@ -0,0 +1,41 @@
|
||||||
|
-- Staging tables: one raw JSON document per row.
CREATE TABLE patents_json (
	json text
);

CREATE TABLE patents_text_json (
	json text
);

-- Document record per patent.
-- FIX: the original used replace(json,'\\','\"') here, which turns EVERY
-- backslash into a quote and corrupts the JSON; the three sibling views all
-- use replace(json,'\\"','\"') (unescape embedded quotes only).
CREATE MATERIALIZED VIEW document AS SELECT
	'50|patents_____::'||MD5(lower(trim(p->>'LocalID'))) AS "id",
	p->>'Title' AS "title",
	p->>'Abstract' AS "abstractText",
	p->>'Type' AS "type",
	p->>'PubYear' AS "pubYear",
	'patent repo'::text AS "repository"
FROM (SELECT replace(json,'\\"','\"')::json AS p FROM patents_json) a;


-- Full text per patent, where present.
CREATE MATERIALIZED VIEW doc_fulltext AS SELECT
	'50|patents_____::'||MD5(lower(trim(p->>'LocalID'))) AS "docId",
	trim(p->>'text') AS "fulltext"
FROM (SELECT replace(json,'\\"','\"')::json AS p FROM patents_text_json) a
WHERE length(trim(p->>'text')) > 0;


-- Original LocalID kept as an alternative identifier.
CREATE MATERIALIZED VIEW doc_other_identifier AS SELECT
	'50|patents_____::'||MD5(lower(trim(p->>'LocalID'))) AS "docId",
	trim(p->>'LocalID') AS "id",
	'patent'::text AS "type"
FROM (SELECT replace(json,'\\"','\"')::json AS p FROM patents_json) a;

-- NOTE(review): the mock-project id uses the '50|' prefix here, while the
-- guidelines scripts use '40|MOCK_PROJECT::' for project ids — confirm which
-- prefix the importer expects (both views below do agree with each other).
CREATE MATERIALIZED VIEW project AS SELECT
	'50|MOCK_PROJECT::'||MD5('EC_FP7')::text AS "id",
	'MOCK PROJECT'::text AS "title",
	'EC'::text AS "funder",
	'FP7'::text AS "fundingLevel0";

-- Every patent is linked to the single EC/FP7 mock project.
CREATE MATERIALIZED VIEW doc_project AS SELECT
	'50|patents_____::'||MD5(lower(trim(p->>'LocalID'))) AS "docId",
	'50|MOCK_PROJECT::'||MD5('EC_FP7') AS "projectId"
FROM (SELECT replace(json,'\\"','\"')::json AS p FROM patents_json) a;
|
|
@ -0,0 +1 @@
|
||||||
|
usare i documenti nella relativa directory
|
|
@ -0,0 +1,381 @@
|
||||||
|
|
||||||
|
alter table project_portfolio add column json json;
|
||||||
|
alter table project_portfolio add column administrative_data json;
|
||||||
|
alter table project_portfolio add column governance_data json;
|
||||||
|
|
||||||
|
--sections
|
||||||
|
alter table project_portfolio add column executive_summary json;
|
||||||
|
alter table project_portfolio add column final_report_summary json;
|
||||||
|
alter table project_portfolio add column impact json;
|
||||||
|
alter table project_portfolio add column objective json;
|
||||||
|
alter table project_portfolio add column title json;
|
||||||
|
--/sections
|
||||||
|
|
||||||
|
update project_portfolio set json = convert_from(decode(portfolio, 'base64'), 'UTF8')::json ;
|
||||||
|
update project_portfolio set administrative_data = json->'administrative_data';
|
||||||
|
update project_portfolio set governance_data = json->'governance_data';
|
||||||
|
update project_portfolio set executive_summary = json->'sections'->'executive_summary';
|
||||||
|
update project_portfolio set final_report_summary = json->'sections'->'final_report_summary';
|
||||||
|
update project_portfolio set impact = json->'sections'->'impact';
|
||||||
|
update project_portfolio set objective = json->'sections'->'objective';
|
||||||
|
|
||||||
|
update project_portfolio set results_in_brief = json->'sections'->'results_in_brief';
|
||||||
|
update project_portfolio set results = json->'sections'->'results';
|
||||||
|
update project_portfolio set impact = json->'sections'->'impact';
|
||||||
|
|
||||||
|
update project_portfolio set title = json->'sections'->'title';
|
||||||
|
|
||||||
|
-- document
|
||||||
|
INSERT INTO DOCUMENT
|
||||||
|
(id,
|
||||||
|
title,
|
||||||
|
abstract,
|
||||||
|
doctype,
|
||||||
|
repository,
|
||||||
|
rights,
|
||||||
|
pubyear)
|
||||||
|
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_object'),
|
||||||
|
'40|corda_______', '50|fp7___object') AS id,
|
||||||
|
'Objectives of project '
|
||||||
|
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
|
||||||
|
objective ->> 'text' AS abstract,
|
||||||
|
'project_report' AS doctype,
|
||||||
|
'CORDIS' AS repository,
|
||||||
|
'OPEN' :: TEXT AS rights,
|
||||||
|
administrative_data ->> 'date_to' :: TEXT AS pubyear
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE objective ->> 'text' IS NOT NULL
|
||||||
|
UNION ALL
|
||||||
|
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020summary'),
|
||||||
|
'40|corda_______', '50|fp7__summary') AS id,
|
||||||
|
'Final report summary of project '
|
||||||
|
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
|
||||||
|
final_report_summary ->> 'text' AS abstract,
|
||||||
|
'project_report' AS doctype,
|
||||||
|
'CORDIS' AS repository,
|
||||||
|
'OPEN' :: TEXT AS rights,
|
||||||
|
administrative_data ->> 'date_to' :: TEXT AS pubyear
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE final_report_summary ->> 'text' IS NOT NULL
|
||||||
|
UNION ALL
|
||||||
|
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020___exec'),
|
||||||
|
'40|corda_______', '50|fp7_____exec') AS id,
|
||||||
|
'Executive summary of project '
|
||||||
|
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
|
||||||
|
executive_summary ->> 'text' AS abstract,
|
||||||
|
'project_report' AS doctype,
|
||||||
|
'CORDIS' AS repository,
|
||||||
|
'OPEN' :: TEXT AS rights,
|
||||||
|
administrative_data ->> 'date_to' :: TEXT AS pubyear
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE executive_summary ->> 'text' IS NOT NULL
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020__brief'),
|
||||||
|
'40|corda_______', '50|fp7____brief') AS id,
|
||||||
|
'Results in brief of project '
|
||||||
|
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
|
||||||
|
results_in_brief ->> 'text' AS abstract,
|
||||||
|
'project_report' AS doctype,
|
||||||
|
'CORDIS' AS repository,
|
||||||
|
'OPEN' :: TEXT AS rights,
|
||||||
|
administrative_data ->> 'date_to' :: TEXT AS pubyear
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE results_in_brief ->> 'text' IS NOT NULL
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020results'),
|
||||||
|
'40|corda_______', '50|fp7__results') AS id,
|
||||||
|
'Results of project '
|
||||||
|
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
|
||||||
|
results ->> 'text' AS abstract,
|
||||||
|
'project_report' AS doctype,
|
||||||
|
'CORDIS' AS repository,
|
||||||
|
'OPEN' :: TEXT AS rights,
|
||||||
|
administrative_data ->> 'date_to' :: TEXT AS pubyear
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE results ->> 'text' IS NOT NULL
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_impact'),
|
||||||
|
'40|corda_______', '50|fp7___impact') AS id,
|
||||||
|
'Impact of project '
|
||||||
|
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
|
||||||
|
impact ->> 'text' AS abstract,
|
||||||
|
'project_report' AS doctype,
|
||||||
|
'CORDIS' AS repository,
|
||||||
|
'OPEN' :: TEXT AS rights,
|
||||||
|
administrative_data ->> 'date_to' :: TEXT AS pubyear
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE impact ->> 'text' IS NOT NULL
|
||||||
|
|
||||||
|
|
||||||
|
-- doc_project
|
||||||
|
INSERT INTO doc_project
|
||||||
|
(docid,
|
||||||
|
projectid,
|
||||||
|
inferred)
|
||||||
|
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_object'),
|
||||||
|
'40|corda_______', '50|fp7___object') AS docid,
|
||||||
|
projectid,
|
||||||
|
TRUE AS inferred
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE objective ->> 'text' IS NOT NULL
|
||||||
|
UNION ALL
|
||||||
|
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020summary'),
|
||||||
|
'40|corda_______', '50|fp7__summary') AS docid,
|
||||||
|
projectid,
|
||||||
|
TRUE AS inferred
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE final_report_summary ->> 'text' IS NOT NULL
|
||||||
|
UNION ALL
|
||||||
|
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020___exec'),
|
||||||
|
'40|corda_______', '50|fp7_____exec') AS docid,
|
||||||
|
projectid,
|
||||||
|
TRUE AS inferred
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE executive_summary ->> 'text' IS NOT NULL
|
||||||
|
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020__brief'),
|
||||||
|
'40|corda_______', '50|fp7____brief') AS docid,
|
||||||
|
projectid,
|
||||||
|
TRUE AS inferred
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE results_in_brief ->> 'text' IS NOT NULL
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020results'),
|
||||||
|
'40|corda_______', '50|fp7__results') AS docid,
|
||||||
|
projectid,
|
||||||
|
TRUE AS inferred
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE results ->> 'text' IS NOT NULL
|
||||||
|
UNION ALL
|
||||||
|
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_impact'),
|
||||||
|
'40|corda_______', '50|fp7___impact') AS docid,
|
||||||
|
projectid,
|
||||||
|
TRUE AS inferred
|
||||||
|
FROM project_portfolio
|
||||||
|
WHERE impact ->> 'text' IS NOT NULL
|
||||||
|
|
||||||
|
-- updates the project table with data from the project_portfolios
|
||||||
|
update project p set (total_cost,contribution,currency) = ((administrative_data->>'Total cost')::numeric, (administrative_data->>'contribution')::numeric, 'EURO'::text) from
|
||||||
|
project_portfolio pp where pp.projectid = p.id ;
|
||||||
|
|
||||||
|
update project_organization po set (contribution, currency) = (U.contribution, 'EURO'::text) from
|
||||||
|
(
|
||||||
|
select projectid, '20|ec__________::'||MD5(o->>'pic') as orgid, (o->>'contribution')::numeric as contribution from
|
||||||
|
(
|
||||||
|
select projectid, json_array_elements(administrative_data->'coordinators') as o from project_portfolio
|
||||||
|
union all
|
||||||
|
select projectid, json_array_elements(administrative_data->'participants') as o from project_portfolio
|
||||||
|
) as T
|
||||||
|
) as U where po.orgid = U.orgid and po.projectid = U.projectid
|
||||||
|
|
||||||
|
-- include start/end dates from project portfolios
|
||||||
|
update project p set startdate = pp.administrative_data->>'date_from' from project_portfolio pp where p.startdate is null and p.id = pp.projectid ;
|
||||||
|
update project p set enddate = pp.administrative_data->>'date_to' from project_portfolio pp where p.enddate is null and p.id = pp.projectid ;
|
||||||
|
|
||||||
|
|
||||||
|
-- work in progress, waiting for ARC to fix the data. For now we keep only the activity types that do not contain any new-line characters
|
||||||
|
update project_organization po set activitytype = o.activitytype from (select '20|ec__________::'||MD5(o->>'pic') as orgid, o->>'activity_type' as activitytype from
|
||||||
|
(
|
||||||
|
select json_array_elements(administrative_data->'coordinators') as o from project_portfolio
|
||||||
|
union all
|
||||||
|
select json_array_elements(administrative_data->'participants') as o from project_portfolio
|
||||||
|
) as T
|
||||||
|
where o->>'pic' is not null and o->>'activity_type' !~ E'.*\n.*') o where po.orgid = o.orgid;
|
||||||
|
|
||||||
|
|
||||||
|
-- extract PubMed publications from the project portfolios
|
||||||
|
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | map([.key, .value.ArticleTitle, .value.AbstractText, .value.ArticleDate]) | .[] | @csv' {} \; > ../document_pp.csv
|
||||||
|
find . -name '*.json' -exec bash -c "jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as \$id | .value.Authors | to_entries | .[] | .key as \$i | { docid : \$id, fullname : (.value.LastName+\", \"+.value.ForeName), rank: (map(\$i+1) | unique | .[0]) } ) ' \"{}\" | jq -s -r 'map([.docid, .fullname, .rank]) | .[] | @csv' " \; > ../doc_author_pp.csv
|
||||||
|
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.OtherIDs | map([$id, .Source, .id ] )) | .[] | @csv ' {} \; > ../doc_other_id.csv
|
||||||
|
find . -name 'FP7*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda_______::\", \$grant ] | @csv " {} \; > ../doc_project_pp.csv 15:50:53
|
||||||
|
find . -name 'H2020*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda__h2020::\", \$grant ] | @csv " {} \; >> ../doc_project_pp.csv
|
||||||
|
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.MeshHeadings | map([$id, (group_by(.Label) | .[] )])) | map([.[0], .[1][0].text, ([(.[2][]?.text )] | join("@")) ]) | .[] | @csv ' {} \; > ../doc_subject_pp.csv
|
||||||
|
|
||||||
|
//DOCUMENTS
|
||||||
|
create table document_pp(id text, title text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN', doctype text default 'publication');
|
||||||
|
copy document_pp (id, title, abstract, pubyear) from '/Users/claudio/workspace/data/d4i/document_pp.csv' CSV ;
|
||||||
|
create table document_pp_unique as (select distinct * from document_pp );
|
||||||
|
drop table document_pp;
|
||||||
|
alter table document_pp_unique rename to document_pp ;
|
||||||
|
update document_pp set repository = 'PubMed Central PP' ;
|
||||||
|
update document_pp set pubyear = to_date(pubyear, 'DD/MM/YYYY')::text ;
|
||||||
|
update document_pp set id= '50|pp_______267::'||MD5(id) ;
|
||||||
|
|
||||||
|
// DOC_AUTHOR
|
||||||
|
create table doc_author_pp (docid text, fullname text, rank integer);
|
||||||
|
copy doc_author_pp (docid, fullname, rank) from '/Users/claudio/workspace/data/d4i/doc_author_pp.csv' CSV ;
|
||||||
|
update doc_author_pp set fullname = SUBSTRING(fullname, 0, length(fullname) + 1 - 2) where fullname like '%, ';
|
||||||
|
create table doc_author_pp_u as (select distinct * from doc_author_pp) ;
|
||||||
|
drop table doc_author_pp;
|
||||||
|
alter table doc_author_pp_u rename to doc_author_pp ;
|
||||||
|
update doc_author_pp set docid = '50|pp_______267::'||MD5(docid) ;
|
||||||
|
|
||||||
|
// DOC_SUBJECT
|
||||||
|
create table doc_subject_pp(docid text, subject text, typology text);
|
||||||
|
create table subject_tmp(id text, descriptor text, qualifiers text);
|
||||||
|
copy subject_tmp (id, descriptor, qualifiers) from '/Users/claudio/workspace/data/d4i/doc_subject_pp.csv' CSV;
|
||||||
|
insert into doc_subject_pp select '50|pp_______267::'||MD5(id) as docid, s as subject, 'MeshHeadings' as typology from ( select id, d||'|'||q as s from ( select id, descriptor as d, unnest(regexp_split_to_array(qualifiers, '@')) as q from subject_tmp where qualifiers <> '') as t UNION ALL select distinct id, descriptor as s from subject_tmp) as t ;
|
||||||
|
create table doc_subject_pp_u as select distinct * from doc_subject_pp;
|
||||||
|
drop table doc_subject_pp;
|
||||||
|
alter table doc_subject_pp_u rename to doc_subject_pp;
|
||||||
|
|
||||||
|
// DOC_PROJECT
|
||||||
|
create table doc_project_pp(docid text, projectid text);
|
||||||
|
create table dp_tmp (docid text, profix text, grantid text) ;
|
||||||
|
copy dp_tmp(docid, profix, grantid) from '/Users/claudio/workspace/data/d4i/doc_project_pp.csv' CSV;
|
||||||
|
insert into doc_project_pp select '50|pp_______267::'||MD5(docid), profix||MD5(grantid) from dp_tmp ;
|
||||||
|
|
||||||
|
// DOC_OTHER_IDENTIFIER
|
||||||
|
create table doc_other_identifier_pp(docid text, idtype text, id text);
|
||||||
|
copy doc_other_identifier_pp (docid, idtype, id) from '/Users/claudio/workspace/data/d4i/doc_other_id.csv' CSV;
|
||||||
|
update doc_other_identifier_pp set idtype = 'pmid' where idtype = 'pubmed' ;
|
||||||
|
update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ;
|
||||||
|
update doc_other_identifier_pp set docid= '50|pp_______267::'||MD5(docid);
|
||||||
|
|
||||||
|
|
||||||
|
// Caricamento dei csv sul db, cleaning degli idtype, generazione dei subject (mesh), distinct values, ...
|
||||||
|
|
||||||
|
|
||||||
|
create table doc_alias_pp(id text, idpp text);
|
||||||
|
insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> '';
|
||||||
|
|
||||||
|
alter table document_pp add column existing_docid text;
|
||||||
|
alter table doc_other_identifier_pp add column existing_docid text;
|
||||||
|
alter table doc_author_pp add column existing_docid text;
|
||||||
|
alter table doc_project_pp add column existing_docid text;
|
||||||
|
alter table doc_subject_pp add column existing_docid text;
|
||||||
|
|
||||||
|
update document_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp;
|
||||||
|
update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp;
|
||||||
|
update doc_author_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid = doc_alias_pp.idpp;
|
||||||
|
update doc_project_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp;
|
||||||
|
update doc_subject_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp;
|
||||||
|
|
||||||
|
update document_pp set id = existing_docid where existing_docid is not null;
|
||||||
|
update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null;
|
||||||
|
update doc_author_pp set docid = existing_docid where existing_docid is not null;
|
||||||
|
update doc_project_pp set docid = existing_docid where existing_docid is not null;
|
||||||
|
update doc_subject_pp set docid = existing_docid where existing_docid is not null;
|
||||||
|
|
||||||
|
alter table document_pp drop column existing_docid ;
|
||||||
|
alter table doc_other_identifier_pp drop column existing_docid ;
|
||||||
|
alter table doc_author_pp drop column existing_docid ;
|
||||||
|
alter table doc_project_pp drop column existing_docid ;
|
||||||
|
alter table doc_subject_pp drop column existing_docid ;
|
||||||
|
|
||||||
|
-- ONLY FOR MISSING DOCUMENT
|
||||||
|
insert into document (id, title, abstract, doctype, repository, pubyear, rights) select id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp_______267::%';
|
||||||
|
insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp_______267::%' on conflict do nothing;
|
||||||
|
|
||||||
|
-- FOR ALL DOCUMENTS (I exclude the pii ids because it seems that the same id is associated to many documents)
|
||||||
|
insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid;
|
||||||
|
insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing;
|
||||||
|
insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing;
|
||||||
|
|
||||||
|
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
-- extract RestPublications publications from the project portfolios
|
||||||
|
|
||||||
|
//--PART 1 - to be run in local
|
||||||
|
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | map([.key, .value.title, .value.resulttype, .value.description, .value.dateofacceptance]) | .[] | @csv' {} \; > ../document_pp.csv
|
||||||
|
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.creators | map([$id, .full, .rank])) | .[] | @csv' {} \; > ../doc_author_pp.csv
|
||||||
|
find . -name 'FP7*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda_______::\", \$grant ] | @csv " {} \; > ../doc_project_pp.csv
|
||||||
|
find . -name 'H2020*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda__h2020::\", \$grant ] | @csv " {} \; >> ../doc_project_pp.csv
|
||||||
|
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.subjects | map([$id, .value, .class])) | .[] | @csv' {} \; > ../doc_subject_pp.csv
|
||||||
|
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.pids | map([$id, .value, .class])) | .[] | @csv' {} \; > ../doc_other_id.csv
|
||||||
|
|
||||||
|
|
||||||
|
//DOCUMENTS
|
||||||
|
drop table if exists document_pp;
|
||||||
|
create table document_pp(id text, title text, doctype text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN');
|
||||||
|
copy document_pp (id, title, doctype, abstract, pubyear) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/document_pp.csv' CSV ;
|
||||||
|
create table document_pp_unique as (select distinct * from document_pp );
|
||||||
|
drop table document_pp;
|
||||||
|
alter table document_pp_unique rename to document_pp ;
|
||||||
|
update document_pp set repository = 'Rest Publications PP' ;
|
||||||
|
update document_pp set id= '50|pp__restpubs::'||MD5(id);
|
||||||
|
|
||||||
|
// DOC_AUTHOR
|
||||||
|
drop table if exists doc_author_pp;
|
||||||
|
create table doc_author_pp (docid text, fullname text, rank integer);
|
||||||
|
copy doc_author_pp (docid, fullname, rank) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_author_pp.csv' CSV ;
|
||||||
|
create table doc_author_pp_u as (select distinct * from doc_author_pp) ;
|
||||||
|
drop table doc_author_pp;
|
||||||
|
alter table doc_author_pp_u rename to doc_author_pp ;
|
||||||
|
update doc_author_pp set docid = '50|pp__restpubs::'||MD5(docid) ;
|
||||||
|
|
||||||
|
// DOC_PROJECT
|
||||||
|
drop table if exists doc_project_pp;
|
||||||
|
create table doc_project_pp(docid text, projectid text);
|
||||||
|
create table dp_tmp (docid text, prefix text, grantid text) ;
|
||||||
|
copy dp_tmp(docid, prefix, grantid) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_project_pp.csv' CSV;
|
||||||
|
insert into doc_project_pp select distinct '50|pp__restpubs::'||MD5(docid), prefix||MD5(grantid) from dp_tmp ;
|
||||||
|
|
||||||
|
// DOC_SUBJECT
|
||||||
|
drop table if exists doc_subject_pp;
|
||||||
|
create table doc_subject_pp(docid text, subject text, typology text);
|
||||||
|
copy doc_subject_pp (docid, subject, typology) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_subject_pp.csv' CSV;
|
||||||
|
delete from doc_subject_pp where subject is null OR subject = '';
|
||||||
|
create table doc_subject_pp_u as select distinct * from doc_subject_pp;
|
||||||
|
drop table doc_subject_pp;
|
||||||
|
alter table doc_subject_pp_u rename to doc_subject_pp;
|
||||||
|
update doc_subject_pp set docid = '50|pp__restpubs::'||MD5(docid) ;
|
||||||
|
|
||||||
|
// DOC_OTHER_IDENTIFIER
|
||||||
|
drop table if exists doc_other_identifier_pp;
|
||||||
|
create table doc_other_identifier_pp(docid text, id text, idtype text);
|
||||||
|
copy doc_other_identifier_pp (docid, id, idtype) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_other_id.csv' CSV;
|
||||||
|
delete from doc_other_identifier_pp where id is null OR id = '';
|
||||||
|
create table doc_other_identifier_pp_u as select distinct * from doc_other_identifier_pp;
|
||||||
|
drop table doc_other_identifier_pp;
|
||||||
|
alter table doc_other_identifier_pp_u rename to doc_other_identifier_pp;
|
||||||
|
update doc_other_identifier_pp set idtype = 'pmid' where idtype = 'pubmed' ;
|
||||||
|
update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ;
|
||||||
|
update doc_other_identifier_pp set docid = '50|pp__restpubs::'||MD5(docid);
|
||||||
|
|
||||||
|
// -- PART 2 - to be run on the server
|
||||||
|
create table doc_alias_pp(id text, idpp text);
|
||||||
|
insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> '';
|
||||||
|
|
||||||
|
alter table document_pp add column existing_docid text;
|
||||||
|
alter table doc_other_identifier_pp add column existing_docid text;
|
||||||
|
alter table doc_author_pp add column existing_docid text;
|
||||||
|
alter table doc_project_pp add column existing_docid text;
|
||||||
|
alter table doc_subject_pp add column existing_docid text;
|
||||||
|
|
||||||
|
update document_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp;
|
||||||
|
update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp;
|
||||||
|
update doc_author_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid = doc_alias_pp.idpp;
|
||||||
|
update doc_project_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp;
|
||||||
|
update doc_subject_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp;
|
||||||
|
|
||||||
|
update document_pp set id = existing_docid where existing_docid is not null;
|
||||||
|
update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null;
|
||||||
|
update doc_author_pp set docid = existing_docid where existing_docid is not null;
|
||||||
|
update doc_project_pp set docid = existing_docid where existing_docid is not null;
|
||||||
|
update doc_subject_pp set docid = existing_docid where existing_docid is not null;
|
||||||
|
|
||||||
|
alter table document_pp drop column existing_docid ;
|
||||||
|
alter table doc_other_identifier_pp drop column existing_docid ;
|
||||||
|
alter table doc_author_pp drop column existing_docid ;
|
||||||
|
alter table doc_project_pp drop column existing_docid ;
|
||||||
|
alter table doc_subject_pp drop column existing_docid ;
|
||||||
|
|
||||||
|
-- ONLY FOR MISSING DOCUMENT
|
||||||
|
insert into document (id, title, abstract, doctype, repository, pubyear, rights) select distinct id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp__restpubs::%';
|
||||||
|
insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp__restpubs::%' on conflict do nothing;
|
||||||
|
-- FOR ALL DOCUMENTS (I exclude the pii ids because it seems that the same id is associated to many documents)
|
||||||
|
insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid;
|
||||||
|
insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing;
|
||||||
|
insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing;
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
SAVEIFS=$IFS
|
||||||
|
IFS=$(echo -en "\n\b")
|
||||||
|
|
||||||
|
tmp="/tmp/tempfile.sql"
|
||||||
|
rm -f "$tmp"
|
||||||
|
|
||||||
|
echo "DELETE FROM project_portfolio;" >> "$tmp"
|
||||||
|
|
||||||
|
for f in `ls /data/d4i/project_portfolios/november2018/D4I_Analytics_ARC_Release04_WP52_31Nov2018_fixed/FP7_*.json`
|
||||||
|
do
|
||||||
|
id=$(jq .administrative_data.project_id "$f" | tr -d '"')
|
||||||
|
echo -n "INSERT INTO project_portfolio(projectid, portfolio) VALUES ('40|corda_______::'||MD5('$id'), '" >> "$tmp"
|
||||||
|
cat "$f" | gzip -c | base64 | tr -d '\n' >> "$tmp"
|
||||||
|
echo "');" >> "$tmp"
|
||||||
|
done
|
||||||
|
|
||||||
|
for f in `ls /data/d4i/project_portfolios/november2018/D4I_Analytics_ARC_Release04_WP52_31Nov2018_fixed/H2020_*.json`
|
||||||
|
do
|
||||||
|
id=$(jq .administrative_data.project_id "$f" | tr -d '"')
|
||||||
|
echo -n "INSERT INTO project_portfolio(projectid, portfolio) VALUES ('40|corda__h2020::'||MD5('$id'), '" >> "$tmp"
|
||||||
|
cat "$f" | gzip -c | base64 | tr -d '\n' >> "$tmp"
|
||||||
|
echo "');" >> "$tmp"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Inserting file: $tmp"
|
||||||
|
|
||||||
|
#psql data4impact -f "$tmp"
|
||||||
|
|
||||||
|
IFS=$SAVEIFS
|
|
@ -0,0 +1,199 @@
|
||||||
|
CREATE TABLE pp_metrics (
|
||||||
|
id text PRIMARY KEY,
|
||||||
|
eu_contribution numeric,
|
||||||
|
number_of_innovations integer,
|
||||||
|
number_of_companies_founded integer,
|
||||||
|
number_of_patents integer,
|
||||||
|
number_of_projects integer,
|
||||||
|
number_of_pubmed_publications integer,
|
||||||
|
number_of_rest_publications integer,
|
||||||
|
number_of_segments integer,
|
||||||
|
total_cost numeric
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_countries_cooccurrences (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
country1 text,
|
||||||
|
country2 text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, country1, country2)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_eu_contribution_per_country (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
country text,
|
||||||
|
contribution numeric,
|
||||||
|
PRIMARY KEY (funding, country)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_eu_contribution_per_participant_sector (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
sector text,
|
||||||
|
contribution numeric,
|
||||||
|
PRIMARY KEY (funding, sector)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_eu_contribution_per_research_area (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area text,
|
||||||
|
contribution numeric,
|
||||||
|
PRIMARY KEY (funding, area)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_eu_contribution_per_research_area_over_time (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area text,
|
||||||
|
year integer,
|
||||||
|
contribution numeric,
|
||||||
|
PRIMARY KEY (funding, area, year)
|
||||||
|
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_eu_contribution_per_year (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
year integer,
|
||||||
|
contribution numeric,
|
||||||
|
PRIMARY KEY (funding, year)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_innovations_per_type (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
type text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, type)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_innovations_per_type_per_country (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
type text,
|
||||||
|
country text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, type, country)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_innovations_per_type_per_research_area (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
type text,
|
||||||
|
area text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, type, area)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_patents_per_research_area (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, area)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_projects_per_research_area (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, area)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_pubmed_publications_per_country (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
country text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, country)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_pubmed_publications_per_journal (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
journal text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, journal)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_pubmed_publications_per_journal_per_research_area (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
journal text,
|
||||||
|
area text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, journal, area)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_pubmed_publications_per_journal_per_year (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
journal text,
|
||||||
|
year integer,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, journal, year)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_pubmed_publications_per_research_area (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, area)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_pubmed_publications_per_year (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
year integer,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, year)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- IT IS EQUIVALENT TO pp_number_of_pubmed_publications_per_journal_per_year --
|
||||||
|
CREATE TABLE pp_number_of_pubmed_publications_per_year_per_journal (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
journal text,
|
||||||
|
year integer,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, journal, year)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_rest_publications_per_research_area (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, area)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_number_of_rest_publications_per_year (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
year integer,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, year)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_research_areas_cooccurrences (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area1 text,
|
||||||
|
area2 text,
|
||||||
|
number integer,
|
||||||
|
PRIMARY KEY (funding, area1, area2)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_research_areas_to_icd10 (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area text,
|
||||||
|
icd10 text,
|
||||||
|
PRIMARY KEY (funding, area)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_total_cost_per_research_area (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area text,
|
||||||
|
cost numeric,
|
||||||
|
PRIMARY KEY (funding, area)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_total_cost_per_research_area_over_time (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
area text,
|
||||||
|
year integer,
|
||||||
|
cost numeric,
|
||||||
|
PRIMARY KEY (funding, area, year)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE pp_total_cost_per_year (
|
||||||
|
funding text REFERENCES pp_metrics(id),
|
||||||
|
year integer,
|
||||||
|
cost numeric,
|
||||||
|
PRIMARY KEY (funding, year)
|
||||||
|
);
|
||||||
|
|
|
@ -0,0 +1,194 @@
|
||||||
|
#!/bin/bash
# Rebuilds the $db Postgres database from the D4I "statistics on release" JSON
# dump: recreates the schema, loads pp_metrics, then every per-dimension
# breakdown table. Requires jq and the Postgres client tools on PATH.

set -e

file=/Users/claudio/workspace/data/d4i/november2018/D4I_Metrics_ARC_Release04_WP52_31Nov2018/statistics_on_release.json
db=metrics_tmp

# Loads a table whose JSON source is a one-level map: funding -> { key -> value }.
#   $1 = JSON field name inside each funding object
#   $2 = COPY target in the form "table(col1,col2,col3)"
import_flat() {
	echo "Importing table ${2%%(*}"
	jq -r --arg f "$1" 'to_entries | (map([.key, (.value[$f] | to_entries | map([.key, .value] ))]) ) | .[] | to_entries | .[0].value as $id | .[1].value | map([$id,.[0],.[1]]) | .[] | @csv' "$file" \
		| sed -e 's/"null"/-1/g' \
		| psql "$db" -c "COPY $2 FROM STDIN CSV"
	echo
}

# Loads a table whose JSON source is a two-level map:
# funding -> { key1 -> { key2 -> value } }.
#   $1 = JSON field name inside each funding object
#   $2 = COPY target in the form "table(col1,col2,col3,col4)"
import_nested() {
	echo "Importing table ${2%%(*}"
	jq -r --arg f "$1" 'to_entries | (map([.key, (.value[$f] | to_entries | map([.key, (.value | to_entries | map([.key, .value ]))]))])) | .[] | to_entries | .[0].value as $id | .[1].value[] | to_entries | .[0].value as $x | .[1].value | map([$id,$x,.[0],.[1]]) | .[] | @csv' "$file" \
		| sed -e 's/"null"/-1/g' \
		| psql "$db" -c "COPY $2 FROM STDIN CSV"
	echo
}

echo "Recreating the database $db"
dropdb "$db" --if-exists
createdb "$db"
psql "$db" -f schema.sql
echo

# pp_metrics is special: one row per funding with a fixed set of scalar columns.
# The sed maps every "null" marker to -1 (the /g flag is required: a line can
# contain more than one).
echo "Importing table pp_metrics"
jq -r 'to_entries | map([.key, .value.eu_contribution, .value.number_of_innovations, .value.number_of_companies_founded, .value.number_of_patents, .value.number_of_projects, .value.number_of_pubmed_publications, .value.number_of_rest_publications, .value.number_of_segments, .value.total_cost]) | .[] | @csv' "$file" \
	| sed -e 's/"null"/-1/g' \
	| psql "$db" -c "COPY pp_metrics(id,eu_contribution,number_of_innovations,number_of_companies_founded,number_of_patents,number_of_projects,number_of_pubmed_publications,number_of_rest_publications,number_of_segments,total_cost) FROM STDIN CSV"
echo

import_nested countries_cooccurrences "pp_countries_cooccurrences(funding,country1,country2,number)"
import_flat   eu_contribution_per_country "pp_eu_contribution_per_country(funding,country,contribution)"
import_flat   eu_contribution_per_participant_sector "pp_eu_contribution_per_participant_sector(funding,sector,contribution)"
import_flat   eu_contribution_per_research_area "pp_eu_contribution_per_research_area(funding,area,contribution)"
import_nested eu_contribution_per_research_area_over_time "pp_eu_contribution_per_research_area_over_time(funding,year,area,contribution)"
import_flat   eu_contribution_per_year "pp_eu_contribution_per_year(funding,year,contribution)"
import_flat   number_of_innovations_per_type "pp_number_of_innovations_per_type(funding,type,number)"
import_nested number_of_innovations_per_type_per_country "pp_number_of_innovations_per_type_per_country(funding,country,type,number)"
import_nested number_of_innovations_per_type_per_research_area "pp_number_of_innovations_per_type_per_research_area(funding,area,type,number)"
import_flat   number_of_patents_per_research_area "pp_number_of_patents_per_research_area(funding,area,number)"
import_flat   number_of_projects_per_research_area "pp_number_of_projects_per_research_area(funding,area,number)"
import_flat   number_of_pubmed_publications_per_country "pp_number_of_pubmed_publications_per_country(funding,country,number)"
import_flat   number_of_pubmed_publications_per_journal "pp_number_of_pubmed_publications_per_journal(funding,journal,number)"
import_nested number_of_pubmed_publications_per_journal_per_research_area "pp_number_of_pubmed_publications_per_journal_per_research_area(funding,journal,area,number)"
import_nested number_of_pubmed_publications_per_journal_per_year "pp_number_of_pubmed_publications_per_journal_per_year(funding,journal,year,number)"
import_flat   number_of_pubmed_publications_per_research_area "pp_number_of_pubmed_publications_per_research_area(funding,area,number)"
import_flat   number_of_pubmed_publications_per_year "pp_number_of_pubmed_publications_per_year(funding,year,number)"
import_nested number_of_pubmed_publications_per_year_per_journal "pp_number_of_pubmed_publications_per_year_per_journal(funding,year,journal,number)"
import_flat   number_of_rest_publications_per_research_area "pp_number_of_rest_publications_per_research_area(funding,area,number)"
import_flat   number_of_rest_publications_per_year "pp_number_of_rest_publications_per_year(funding,year,number)"
import_nested research_areas_cooccurrences "pp_research_areas_cooccurrences(funding,area1,area2,number)"
import_flat   research_areas_to_icd10 "pp_research_areas_to_icd10(funding,area,icd10)"
import_flat   total_cost_per_research_area "pp_total_cost_per_research_area(funding,area,cost)"
import_nested total_cost_per_research_area_over_time "pp_total_cost_per_research_area_over_time(funding,year,area,cost)"
import_flat   total_cost_per_year "pp_total_cost_per_year(funding,year,cost)"
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
-- Copies the staged fulltexts into doc_fulltext, matching documents to the
-- staging table by publication id. (An inner join plus the NOT NULL filter is
-- equivalent to the original LEFT OUTER JOIN + WHERE form.)
INSERT INTO doc_fulltext (docid, fulltext)
SELECT d.id       AS docid,
       t.fulltext AS fulltext
FROM document d
JOIN temp_fulltext t ON d.id = t.pubid
WHERE t.fulltext IS NOT NULL;
|
|
@ -0,0 +1,5 @@
|
||||||
|
# DONE using a java application
#
# Decodes the pubmed dump: each line of pubmed.json carries a Mongo
# extended-JSON record whose body["$binary"] field is a base64-encoded,
# gzipped payload. "$line" must be quoted, otherwise the shell word-splits
# and glob-expands the JSON before jq sees it; jq -r replaces the fragile
# sed 's/"//g' quote-stripping.
while read -r line; do
	echo "$line" | jq -r '.body["$binary"]' | base64 -d | gunzip -c
done < pubmed.json
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
-- Emits one JSON document per DOI found in the comma-separated
-- projects.doi_list column, tagged with identifier type "doi".
COPY (
	SELECT row_to_json(t)
	FROM (
		SELECT token AS "id",
		       'doi' AS "type"
		FROM projects p,
		     unnest(string_to_array(p.doi_list, ',')) s(token)
		WHERE token IS NOT NULL
	) t
) TO STDOUT;
|
|
@ -0,0 +1,6 @@
|
||||||
|
-- Emits one JSON record per distinct coordinating organization.
-- Ids follow the D4I convention: '20|swedish_orgs::' + MD5 of the lowercased
-- name; the country is hard-coded to Sweden.
COPY (
	SELECT row_to_json(t)
	FROM (
		SELECT DISTINCT
			'20|swedish_orgs::' || MD5(lower(organizations_coordinating_en)) AS "id",
			organizations_coordinating_en                                    AS "name",
			'SE'                                                             AS "country"
		FROM projects
	) t
) TO STDOUT;
|
|
@ -0,0 +1,8 @@
|
||||||
|
-- Links each project to the documents listed in its comma-separated doi_list,
-- one JSON record per (project, DOI) pair. Project ids follow the D4I
-- convention: '40|' + funder prefix padded to 12 chars + '::' + MD5 of the
-- grant number (dnr).
COPY (
	SELECT row_to_json(t)
	FROM (
		SELECT
			'40|' || rpad(lower(organization_short), 12, '_') || '::' || MD5(dnr) AS "projectId",
			token AS "docId",
			'doi' AS "docIdType"
		FROM projects p,
		     unnest(string_to_array(p.doi_list, ',')) s(token)
		WHERE token IS NOT NULL
	) t
) TO STDOUT;
|
|
@ -0,0 +1,9 @@
|
||||||
|
-- Emits one project/organization relation per project: the coordinating
-- organization, with the project leader as contact person.
COPY (
	SELECT row_to_json(t)
	FROM (
		SELECT
			'40|' || rpad(lower(organization_short), 12, '_') || '::' || MD5(dnr) AS "projectId",
			'20|swedish_orgs::' || MD5(lower(organizations_coordinating_en))      AS "orgId",
			'coordinator'                                                         AS "role",
			people_project_leaders_0_firstname                                    AS "contactFirstNames",
			people_project_leaders_0_surname                                      AS "contactLastNames"
		FROM projects
	) t
) TO STDOUT;
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
-- Exposes each project's original grant number (dnr) as an alternative
-- identifier of type '<funder>:grant_id'.
COPY (
	SELECT row_to_json(t)
	FROM (
		SELECT
			'40|' || rpad(lower(organization_short), 12, '_') || '::' || MD5(dnr) AS "projectId",
			dnr                                                                   AS "id",
			lower(organization_short) || ':grant_id'                              AS "type"
		FROM projects
	) t
) TO STDOUT;
|
|
@ -0,0 +1,19 @@
|
||||||
|
-- Emits one JSON project record per row, mapped to the D4I importer field
-- names; the currency is fixed to Swedish krona.
COPY (
	SELECT row_to_json(t)
	FROM (
		SELECT
			'40|' || rpad(lower(organization_short), 12, '_') || '::' || MD5(dnr) AS "id",
			title_en           AS "title",
			organization_short AS "funder",
			type_of_awards     AS "fundingLevel0",
			dates_start_date   AS "startDate",
			dates_end_date     AS "endDate",
			abstract_en        AS "abstractText",
			tags_0_en          AS "keywords",
			total_funding      AS "contribution",
			'SEK'::text        AS "currency"
		FROM projects
	) t
) TO STDOUT;

-- Columns deliberately not exported here:
-- intrascientific_report_en, popular_report_sv,
-- doi_list (handled by the dedicated DOI export scripts).
|
|
@ -0,0 +1,22 @@
|
||||||
|
-- Raw landing table for the Swedish projects CSV delivery: column order and
-- names mirror the CSV header (see the COPY statement in run.sh). Everything
-- is imported as text except total_funding.
CREATE TABLE projects (
	swecris_info                       text,
	doi                                text,
	final_reports                      text,
	Organization_short                 text,
	Organization_long                  text,
	dnr                                text,
	people_project_leaders_0_surname   text,
	people_project_leaders_0_firstname text,
	organizations_coordinating_en      text,
	type_of_awards                     text,
	dates_start_date                   text,
	dates_end_date                     text,
	title_en                           text,
	abstract_en                        text,
	intrascientific_report_en          text,
	popular_report_sv                  text,
	tags_0_en                          text,
	doi_list                           text,
	total_funding                      numeric
);
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
#!/bin/bash
# Imports the Swedish projects CSV delivery into the "swedishprojects"
# Postgres database, then exports the entities as JSON-lines files for the
# D4I importer. Requires the Postgres client tools on PATH, the CSV at $csv,
# and the *.sql scripts in the current directory.
#
# set -e: without it a failed schema load or COPY would still go on to wipe
# and regenerate the JSON output files.
set -e

csv=/tmp/180626-swe_proj_data-delivery.csv

# Absolute path of the CSV (psql's COPY FROM resolves paths server-side).
inputCsvFile="$(cd "$(dirname "$csv")"; pwd -P)/$(basename "$csv")"

echo
echo "Swedish Projects Import:"

#--------------------------------
echo " - Recreating the swedishprojects database"
dropdb swedishprojects --if-exists
createdb swedishprojects
psql swedishprojects -f schema.sql
psql swedishprojects -c "COPY projects(swecris_info, doi, final_reports, Organization_short, Organization_long, dnr, people_project_leaders_0_surname, people_project_leaders_0_firstname, organizations_coordinating_en, type_of_awards, dates_start_date, dates_end_date, title_en, abstract_en, intrascientific_report_en, popular_report_sv, tags_0_en, doi_list, total_funding) FROM '$inputCsvFile' DELIMITER ',' CSV HEADER;"

#--------------------------------
echo " - Generating json files"
rm -f ../../jsonfiles/swedishProjects/*.json
# The sed collapses the double backslashes produced by row_to_json escaping.
psql swedishprojects -f projects2json.sql     | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/project.json
psql swedishprojects -f orgs2json.sql         | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/organization.json
psql swedishprojects -f projOrg2json.sql      | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/projectOrganization.json
psql swedishprojects -f projOtherIds2json.sql | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/projectOtherId.json
psql swedishprojects -f docOtherId2json.sql   | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/docotherid.json
psql swedishprojects -f projDoi2json.sql      | sed 's/\\\\/\\/g' > ../../jsonfiles/swedishProjects/projectdocotherid.json

echo "Done."
echo
|
@ -0,0 +1,93 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<groupId>eu.dnetlib</groupId>
|
||||||
|
<artifactId>data4impact-importer</artifactId>
|
||||||
|
<version>1.1.0-SNAPSHOT</version>
|
||||||
|
<!-- <scm> <developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/data4impact/data4impact-importer/trunk</developerConnection>
|
||||||
|
</scm> <ciManagement> <system>jenkins</system> <url>https://jenkins-dnet.d4science.org/view/data4impact/job/data4impact-importer/</url>
|
||||||
|
</ciManagement> <distributionManagement> <repository> <id>dnet45-releases</id>
|
||||||
|
<name>D-Net 45 Releases</name> <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
|
||||||
|
<layout>default</layout> </repository> </distributionManagement> -->
|
||||||
|
<!-- Inherit defaults from Spring Boot -->
|
||||||
|
<parent>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-starter-parent</artifactId>
|
||||||
|
<version>2.0.3.RELEASE</version>
|
||||||
|
<relativePath></relativePath>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<!-- <repositories> <repository> <id>dnet-deps</id> <name>dnet-dependencies</name>
|
||||||
|
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet-deps</url>
|
||||||
|
<layout>default</layout> </repository> <repository> <id>dnet45-releases</id>
|
||||||
|
<name>D-Net 45 Releases</name> <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
|
||||||
|
<layout>default</layout> <snapshots> <enabled>true</enabled> </snapshots>
|
||||||
|
</repository> <repository> <id>dnet45-snapshots</id> <name>D-Net 45 Snapshots</name>
|
||||||
|
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
|
||||||
|
<layout>default</layout> <snapshots> <enabled>true</enabled> </snapshots>
|
||||||
|
</repository> </repositories> -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Add typical dependencies for a web application -->
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-starter</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib</groupId>
|
||||||
|
<artifactId>data4impact-model</artifactId>
|
||||||
|
<version>1.1.0-SNAPSHOT</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.reflections</groupId>
|
||||||
|
<artifactId>reflections</artifactId>
|
||||||
|
<version>0.9.11</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
|
<artifactId>jackson-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
|
<artifactId>jackson-annotations</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
|
<artifactId>jackson-databind</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<!-- JUnit -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<executable>true</executable>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<java.version>1.8</java.version>
|
||||||
|
<apache.solr.version>7.1.0</apache.solr.version>
|
||||||
|
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||||
|
<springfox-version>2.8.0</springfox-version>
|
||||||
|
<prometheus.version>0.2.0</prometheus.version>
|
||||||
|
<javamelody.version>1.71.0</javamelody.version>
|
||||||
|
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
|
||||||
|
<dockerfile-maven-version>1.3.6</dockerfile-maven-version>
|
||||||
|
</properties>
|
||||||
|
</project>
|
|
@ -0,0 +1,88 @@
|
||||||
|
package eu.data4impact;
|
||||||
|
|
||||||
|
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.Optional;
import java.util.stream.Stream;

import javax.persistence.EntityManagerFactory;
import javax.transaction.Transactional;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Component;

import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
public class Data4ImpactImporter {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private ApplicationContext applicationContext;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private EntityManagerFactory entityManagerFactory;
|
||||||
|
|
||||||
|
private final ObjectMapper jsonMapper = new ObjectMapper();
|
||||||
|
|
||||||
|
@Transactional
|
||||||
|
public <T> void importFileJson(final Path file, final Class<T> tableClass) {
|
||||||
|
try {
|
||||||
|
final LocalDateTime start = LocalDateTime.now();
|
||||||
|
final JpaRepository<T, ?> repo = findRepositorForTable(tableClass);
|
||||||
|
|
||||||
|
Files.lines(file, StandardCharsets.UTF_8).forEach(l -> processLine(l, tableClass, repo));
|
||||||
|
|
||||||
|
final LocalDateTime end = LocalDateTime.now();
|
||||||
|
final double time = Duration.between(start, end).toNanos() / 1000000000.0;
|
||||||
|
|
||||||
|
System.out.printf("\nDone in %.3f sec.\n\n", time);
|
||||||
|
} catch (final IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
private <T, K> void processLine(final String line, final Class<T> tableClass, final JpaRepository<T, K> repo) {
|
||||||
|
try {
|
||||||
|
final T obj = jsonMapper.readValue(line, tableClass);
|
||||||
|
final K id = (K) entityManagerFactory.getPersistenceUnitUtil().getIdentifier(obj);
|
||||||
|
processObject(obj, id, repo);
|
||||||
|
} catch (final IOException | IllegalAccessException | InstantiationException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private <T, K> void processObject(final T obj, final K id, final JpaRepository<T, K> repo) throws IllegalAccessException, InstantiationException {
|
||||||
|
System.out.println(id);
|
||||||
|
final Optional<T> old = repo.findById(id);
|
||||||
|
if (old.isPresent()) {
|
||||||
|
repo.save(ObjectMerger.mergeObjects(old.get(), obj));
|
||||||
|
} else {
|
||||||
|
repo.save(obj);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Transactional
|
||||||
|
public <T> void importFileXML(final String file, final Class<?> tableClass) {
|
||||||
|
throw new RuntimeException("-- NOT IMPLEMENTED --");
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
private <T, K> JpaRepository<T, K> findRepositorForTable(final Class<T> clazz) {
|
||||||
|
final String repoName = clazz.getSimpleName() + "Repository";
|
||||||
|
|
||||||
|
return applicationContext.getBeansOfType(JpaRepository.class)
|
||||||
|
.entrySet()
|
||||||
|
.stream()
|
||||||
|
.filter(e -> e.getKey().equalsIgnoreCase(repoName))
|
||||||
|
.map(e -> e.getValue())
|
||||||
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new RuntimeException("No repository found for class " + clazz.getName()));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,99 @@
|
||||||
|
package eu.data4impact;
|
||||||
|
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import javax.persistence.Table;
|
||||||
|
|
||||||
|
import org.reflections.Reflections;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.boot.CommandLineRunner;
|
||||||
|
import org.springframework.boot.SpringApplication;
|
||||||
|
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||||
|
|
||||||
|
import eu.data4impact.utils.DatabaseUtils;
|
||||||
|
|
||||||
|
@SpringBootApplication
|
||||||
|
public class Data4ImpactImporterApplication implements CommandLineRunner {
|
||||||
|
|
||||||
|
// private static final Logger log = LoggerFactory.getLogger(Data4ImpactImporterApplication.class);
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private Data4ImpactImporter importer;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private DatabaseUtils databaseUtils;
|
||||||
|
|
||||||
|
public static void main(final String... args) {
|
||||||
|
SpringApplication.run(Data4ImpactImporterApplication.class, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run(final String... args) {
|
||||||
|
|
||||||
|
final Map<String, Class<?>> validEntities = validEntities();
|
||||||
|
|
||||||
|
if (args.length == 0) {
|
||||||
|
printHelp();
|
||||||
|
printValidFiles(validEntities);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (final String f : args) {
|
||||||
|
if (f.toLowerCase().endsWith(".json")) {
|
||||||
|
System.out.println("Processing file: " + f);
|
||||||
|
final Path path = Paths.get(f);
|
||||||
|
final String fileName = path.getFileName().toString();
|
||||||
|
final String entityName = fileName.substring(0, fileName.lastIndexOf('.')).toLowerCase();
|
||||||
|
if (validEntities.containsKey(entityName)) {
|
||||||
|
importer.importFileJson(path, validEntities.get(entityName));
|
||||||
|
} else {
|
||||||
|
System.err.println("\n[ERROR] Entity not found for file " + f);
|
||||||
|
printValidFiles(validEntities);
|
||||||
|
System.exit(-1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
System.err.println("\nNot a json file: " + f);
|
||||||
|
printValidFiles(validEntities);
|
||||||
|
System.exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
System.out.println("Refreshing views...");
|
||||||
|
databaseUtils.refreshMaterializedViews(v -> System.out.println(" - " + v));
|
||||||
|
System.out.println("Done.\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void printHelp() {
|
||||||
|
System.out.println();
|
||||||
|
System.out.println("Missing input files !");
|
||||||
|
System.out.println();
|
||||||
|
System.out.println("Example: java -jar file1.json file2.json ...");
|
||||||
|
System.out.println();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void printValidFiles(final Map<String, Class<?>> validEntities) {
|
||||||
|
System.out.println("\nValid filenames are (ignore case):\n" +
|
||||||
|
validEntities.keySet()
|
||||||
|
.stream()
|
||||||
|
.collect(Collectors.groupingBy(validEntities::get))
|
||||||
|
.entrySet()
|
||||||
|
.stream()
|
||||||
|
.map(e -> String.format(" - For class %s: %s\n",
|
||||||
|
e.getKey().getSimpleName(),
|
||||||
|
e.getValue().stream().collect(Collectors.joining(".json, ")) + ".json"))
|
||||||
|
.collect(Collectors.joining()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, Class<?>> validEntities() {
|
||||||
|
final Map<String, Class<?>> res = new HashMap<>();
|
||||||
|
for (final Class<?> cl : new Reflections("eu.data4impact.model").getTypesAnnotatedWith(Table.class)) {
|
||||||
|
res.put(cl.getSimpleName().toLowerCase(), cl);
|
||||||
|
res.put(cl.getAnnotation(Table.class).name().toLowerCase(), cl);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
package eu.data4impact;
|
||||||
|
|
||||||
|
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier;
|
||||||
|
|
||||||
|
public class ObjectMerger {
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public static <T> T mergeObjects(final T first, final T second) throws IllegalAccessException, InstantiationException {
|
||||||
|
final Class<?> clazz = first.getClass();
|
||||||
|
final Field[] fields = clazz.getDeclaredFields();
|
||||||
|
final T res = (T) clazz.newInstance();
|
||||||
|
for (final Field f : fields) {
|
||||||
|
if (!Modifier.isFinal(f.getModifiers())) {
|
||||||
|
f.setAccessible(true);
|
||||||
|
final Object v1 = f.get(first);
|
||||||
|
final Object v2 = f.get(second);
|
||||||
|
f.set(res, (v2 != null ? v2 : v1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
spring.main.banner-mode = off
|
||||||
|
logging.level.root = WARN
|
||||||
|
|
||||||
|
spring.datasource.url=jdbc:postgresql://localhost:5432/data4impact
|
||||||
|
spring.datasource.username=
|
||||||
|
spring.datasource.password=
|
||||||
|
|
||||||
|
spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect
|
||||||
|
|
||||||
|
# Hibernate ddl auto (create, create-drop, validate, update)
|
||||||
|
spring.jpa.hibernate.ddl-auto = validate
|
||||||
|
spring.jpa.properties.hibernate.hbm2ddl.extra_physical_table_types = MATERIALIZED VIEW
|
||||||
|
spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation=true
|
||||||
|
spring.jpa.open-in-view=true
|
|
@ -0,0 +1,26 @@
|
||||||
|
package eu.data4impact;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import eu.data4impact.model.projects.Project;
|
||||||
|
|
||||||
|
public class ObjectMergerTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() throws IllegalAccessException, InstantiationException {
|
||||||
|
|
||||||
|
final Project p1 = new Project();
|
||||||
|
final Project p2 = new Project();
|
||||||
|
p2.setEcSc39(true);
|
||||||
|
|
||||||
|
final Project p3 = ObjectMerger.mergeObjects(p1, p2);
|
||||||
|
final Project p4 = ObjectMerger.mergeObjects(p2, p1);
|
||||||
|
|
||||||
|
assertTrue(p3.getEcSc39());
|
||||||
|
assertTrue(p4.getEcSc39());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
spring.main.banner-mode = off
|
||||||
|
logging.level.root = WARN
|
||||||
|
|
||||||
|
spring.datasource.url=jdbc:postgresql://localhost:5432/data4impact
|
||||||
|
spring.datasource.username=
|
||||||
|
spring.datasource.password=
|
||||||
|
|
||||||
|
spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect
|
||||||
|
|
||||||
|
# Hibernate ddl auto (create, create-drop, validate, update)
|
||||||
|
spring.jpa.hibernate.ddl-auto = validate
|
||||||
|
spring.jpa.properties.hibernate.hbm2ddl.extra_physical_table_types = MATERIALIZED VIEW
|
||||||
|
spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation=true
|
||||||
|
spring.jpa.open-in-view=true
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue