data4impact/apps/data4impact-import-scripts/scripts/news-blogs-forum-twitter/script.sh

212 lines
8.2 KiB
Bash

# MANUAL STEPS FOR news-blogs-forum
1) cd /data/ftp/d4i/social_data/news-blogs-forum
2) find *.zip -exec bash -c "unzip -p {} | jq --slurp -r 'map([.content,.actor,.topicId,.mediatype,.source,.headline,.url,.dt,.language,.country]) | .[] | @csv'" \; | sed 's/\x00//g' > data4impact_corpus_allmedia.csv
3) Recreate the table in the DB using
-- Rebuild socialdata from scratch. IF EXISTS makes the step re-runnable
-- on a fresh database where the objects do not exist yet.
DROP TABLE IF EXISTS socialdata;
DROP SEQUENCE IF EXISTS socialdata_serial;
CREATE SEQUENCE socialdata_serial START 1;
CREATE TABLE socialdata (
-- Surrogate key: fixed source tag '51|social__data::' plus the MD5 of a
-- sequence counter, generated on insert so COPY never has to supply it.
id text PRIMARY KEY DEFAULT '51|social__data::'||MD5(nextval('socialdata_serial')::text),
content text,
actor text,
topicid text,
mediatype text,
source text,
headline text,
url text,
-- dt kept as text: the CSV is raw jq output and is loaded unvalidated.
dt text,
language text,
country text
);
4) Insert data:
-- Column list must match the field order produced by the jq command in step 2;
-- the CSV has no header row.
COPY socialdata(content,actor,topicId,mediatype,source,headline,url,dt,language,country) FROM '/data/ftp/d4i/social_data/news-blogs-forum/data4impact_corpus_allmedia.csv' CSV;
(OPTIONAL) If the COPY fails (e.g. on embedded NUL bytes), strip them with: perl -pi -e 's/\x00//g' data4impact_corpus_allmedia.csv and re-run the COPY.
#############################################################################################################################################################
# MANUAL STEPS FOR twitter (Buzz)
1) cd "/data/ftp/d4i/social_data/twitter/Buzz JSON Feb"
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.rtid,.rtDate,.date,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter_buzz.csv
3) recreate the table
-- Drop the dependent tags table first: twitterbuzz_tags has a foreign key to
-- twitterbuzz, so dropping the parent alone fails once the tags table exists.
-- IF EXISTS makes the step re-runnable on a fresh database.
DROP TABLE IF EXISTS twitterbuzz_tags;
DROP TABLE IF EXISTS twitterbuzz;
DROP SEQUENCE IF EXISTS twitterbuzz_serial;
CREATE SEQUENCE twitterbuzz_serial START 1;
CREATE TABLE twitterbuzz (
-- Surrogate key: '52|twitter_buzz::' prefix + MD5 of a sequence counter.
id text PRIMARY KEY DEFAULT '52|twitter_buzz::'||MD5(nextval('twitterbuzz_serial')::text),
-- tags holds the raw comma-separated list from jq; it is normalised into
-- twitterbuzz_tags in the patch step (5) and then dropped.
tags text,
language text,
country text,
content text,
topicid text,
sourcetype text,
source text,
actor text,
rtid text,
rtdate text,
date text,
headline text
);
-- One row per (tweet, tag) pair, populated by the patch step.
CREATE TABLE twitterbuzz_tags(
tb_id text REFERENCES twitterbuzz(id),
tag text,
PRIMARY KEY(tb_id, tag)
);
4) Insert data:
-- Column list must match the jq array order from step 2; no header row.
COPY twitterbuzz(tags,language,country,content,topicid,sourcetype,source,actor,rtid,rtdate,date,headline) FROM '/data/ftp/d4i/social_data/twitter/Buzz JSON Feb/twitter_buzz.csv' CSV;
5) patch data:
-- jq emits the literal string "null" when the tags array was null; blank it.
UPDATE twitterbuzz SET tags = '' WHERE tags = 'null';
-- Collapse doubled commas so the split below yields no empty tokens.
-- NOTE(review): a single replace() pass only halves a run of commas; rows
-- containing ',,,,' would need a second pass — confirm against the data.
UPDATE twitterbuzz SET tags = replace(tags, ',,', ',') WHERE tags LIKE '%,,%';
-- Explode the comma-separated list into one (id, tag) row per tag.
insert into twitterbuzz_tags(tb_id, tag) select distinct * from (select id, regexp_split_to_table(tags, ',') as tag from twitterbuzz) as t where tag != '';
-- tags is now fully normalised into twitterbuzz_tags; drop the raw column.
alter table twitterbuzz drop column tags;
#############################################################################################################################################################
# MANUAL STEPS FOR twitter
1) cd "/data/ftp/d4i/social_data/twitter/Corrected JSON"
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.retweetedActor,(.urls | join("§")),.datetime,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter.csv
3) recreate the table
-- Drop dependent tables first: twitter_tags and twitter_urls both hold
-- foreign keys to twitter, so dropping the parent alone fails on a re-run.
-- IF EXISTS makes the step re-runnable on a fresh database.
DROP TABLE IF EXISTS twitter_tags;
DROP TABLE IF EXISTS twitter_urls;
DROP TABLE IF EXISTS twitter;
DROP SEQUENCE IF EXISTS twitter_serial;
CREATE SEQUENCE twitter_serial START 1;
CREATE TABLE twitter (
-- Surrogate key: '52|twitter_____::' prefix + MD5 of a sequence counter.
id text PRIMARY KEY DEFAULT '52|twitter_____::'||MD5(nextval('twitter_serial')::text),
-- tags (comma-separated) and urls ('§'-separated) hold the raw lists from
-- jq; both are normalised into the side tables in step 5 and then dropped.
tags text,
language text,
country text,
content text,
topicid text,
sourcetype text,
source text,
actor text,
retweetedactor text,
urls text,
datetime timestamp,
headline text
);
CREATE TABLE twitter_tags(
t_id text REFERENCES twitter(id),
tag text,
PRIMARY KEY(t_id, tag)
);
CREATE TABLE twitter_urls(
t_id text REFERENCES twitter(id),
url text,
PRIMARY KEY(t_id, url)
);
4) Insert data:
-- Column list must match the jq array order from step 2; no header row.
COPY twitter(tags,language,country,content,topicid,sourcetype,source,actor,retweetedactor,urls,datetime,headline) FROM '/data/ftp/d4i/social_data/twitter/Corrected JSON/twitter.csv' CSV;
5) patch data:
-- Explode the raw delimited lists into the side tables, skipping empty
-- entries and jq's literal "null" for absent arrays.
insert into twitter_tags(t_id, tag) select distinct * from (select id, regexp_split_to_table(tags, ',') as tag from twitter) as t where tag != '' and tag != 'null';
insert into twitter_urls(t_id, url) select distinct * from (select id, regexp_split_to_table(urls, '§') as url from twitter) as t where url != '' and url != 'null';
-- Both columns are now normalised; drop the raw list columns last.
alter table twitter drop column tags;
alter table twitter drop column urls;
#############################################################################################################################################################
# MANUAL STEPS FOR twitter_threads
1) cd "/data/ftp/d4i/social_data/twitter/Threads"
2)
# Thread metadata: one CSV row per thread.
jq -r 'map([.threadId, .length, .velocity, .participants, .startId, .startTime, .endTime]) | .[] | @csv' twitter_threads_metadata.json > twitter_threads_metadata.csv
# Rebuild the tweets CSV from scratch; -f so a missing file is not an error
# on the first run.
rm -f twitter_threads_tweets.csv
# jq-1.6 is required
# Same extraction for every dump; mentions/urls arrays are joined with '§'
# so each survives as a single CSV field (normalised later in the DB).
for f in twitter_threads_doi.json twitter_threads_q1.json twitter_threads_q2.json twitter_threads_q3.json twitter_threads_q4.json twitter_threads_q5.json; do
jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' "$f" >> twitter_threads_tweets.csv
done
3)
-- Drop in reverse dependency order so the step is re-runnable; the original
-- dropped only the parent table and was missing the terminating semicolon,
-- which would make psql swallow the following CREATE into one bad statement.
DROP TABLE IF EXISTS twitter_threads_tweets_mentions;
DROP TABLE IF EXISTS twitter_threads_tweets_urls;
DROP TABLE IF EXISTS twitter_threads_tweets;
DROP TABLE IF EXISTS twitter_threads;
-- One row per conversation thread (from twitter_threads_metadata.csv).
CREATE TABLE twitter_threads (
id text PRIMARY KEY,
length int,
velocity double precision,
participants int,
-- Tweet ids are 64-bit Twitter snowflake values and overflow int4, so every
-- id-bearing numeric column below uses bigint.
startid bigint,
starttime timestamp,
endtime timestamp
);
CREATE TABLE twitter_threads_tweets(
threadid text REFERENCES twitter_threads(id),
tweetid bigint,
fromuser text,
touser text,
inreplyto bigint,
datetime timestamp,
content text,
quotedstatus text,
-- mentions/urls are raw '§'-separated lists; normalised in step 5, then dropped.
mentions text,
urls text,
PRIMARY KEY (threadid, tweetid)
);
CREATE TABLE twitter_threads_tweets_mentions(
threadid text,
tweetid bigint,
mention text,
PRIMARY KEY (threadid, tweetid, mention),
FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
);
CREATE TABLE twitter_threads_tweets_urls(
threadid text,
tweetid bigint,
url text,
PRIMARY KEY (threadid, tweetid, url),
FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
);
4) Insert data:
-- Load the parent table before the tweets table so the FK on threadid holds.
COPY twitter_threads(id, length, velocity, participants, startid, starttime, endtime) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_metadata.csv' CSV;
COPY twitter_threads_tweets(threadid, tweetid, fromuser, touser, inreplyto, datetime, content, quotedstatus, mentions, urls) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_tweets.csv' CSV;
5) patch data:
-- Explode the '§'-separated lists into the side tables, skipping empty
-- entries and jq's literal "null" for absent arrays.
insert into twitter_threads_tweets_mentions(threadid, tweetid, mention) select distinct * from (select threadid, tweetid, regexp_split_to_table(mentions, '§') as mention from twitter_threads_tweets) as t where mention != '' and mention != 'null';
insert into twitter_threads_tweets_urls (threadid, tweetid, url) select distinct * from (select threadid, tweetid, regexp_split_to_table(urls, '§') as url from twitter_threads_tweets) as t where url != '' and url != 'null';
-- Both columns are now normalised; drop the raw list columns last.
alter table twitter_threads_tweets drop column mentions;
alter table twitter_threads_tweets drop column urls;