data4impact/apps/data4impact-import-scripts/scripts/patent_excel/process.sh

44 lines
1.6 KiB
Bash
Executable File

#!/bin/bash
#
# Imports the FP7 patents Excel sheet into PostgreSQL.
#
# Steps:
#   1. Convert the Excel workbook to CSV with xlsx2csv.
#   2. Drop and recreate the "patents_excel" database from schema.sql and
#      load the CSV into its "data" table.
#   3. Refresh three materialized views in the "patents" database.
#   4. Remove previously generated JSON files (the JSON export itself is
#      currently commented out below).
#
# All relative paths (the workbook, schema.sql, ../../jsonfiles) are resolved
# against the current working directory. Failures of individual steps do not
# abort the script; the remaining steps still run.

excelFile="../../orig/patents/FP7_patents_full_list_Except_for_ICT.xlsx"
workdir=/tmp/patentsExcel
csv="$workdir/patents.csv"

# Start from an empty work directory; ${workdir:?} refuses to run rm -rf if
# the variable is ever empty or unset.
rm -rf -- "${workdir:?}" && mkdir -- "$workdir"

echo
echo "Patents Import:"

#--------------------------------
echo " - Generating csv file"
# The redirection creates "$csv" before xlsx2csv even starts, so the mere
# existence of the file proves nothing. Record whether the conversion
# succeeded AND actually produced output.
csvReady=false
if xlsx2csv -c UTF-8 "$excelFile" > "$csv" && [[ -s "$csv" ]]; then
  csvReady=true
fi

#--------------------------------
echo " - Recreating the patents_excel database"
dropdb patents_excel --if-exists
createdb patents_excel
psql patents_excel -f schema.sql

if [[ "$csvReady" == true ]]; then
  echo " - Importing data: $csv"
  # COPY ... FROM '<file>' is read by the database server itself, so the CSV
  # must be readable by the server process. HEADER skips the first CSV line.
  psql patents_excel -c "COPY data(pat_id,type_ip,appnum,appnt,title,pat_url,pat_ref,pat_auth,pat_num,pat_kind,note,appln_id,appln_title_patstat,priority_year,var15,projectid) FROM '$csv' CSV HEADER;"
else
  echo " - Invalid file: $csv" >&2
fi

# NOTE(review): the views are refreshed in the "patents" database, not in the
# "patents_excel" database recreated above -- confirm this is intended.
psql patents -c "REFRESH MATERIALIZED VIEW document"
psql patents -c "REFRESH MATERIALIZED VIEW doc_other_identifier"
psql patents -c "REFRESH MATERIALIZED VIEW doc_project"

#--------------------------------
echo " - Generating json files"
# Only the cleanup of old JSON files is active; the export commands below are
# disabled.
rm -f ../../jsonfiles/patents_excel/*.json
#psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM document ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents_excel/document.json
#psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_other_identifier) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents_excel/doc_other_identifier.json
#psql patents -c "COPY (SELECT row_to_json(t) FROM (SELECT * FROM doc_project ) t) TO STDOUT" | sed 's/\\\\/\\/g' > ../../jsonfiles/patents_excel/doc_project.json

echo "Done."
echo