data4impact/apps/data4impact-import-scripts/scripts/news-blogs-forum-twitter/script.sh

212 lines
8.2 KiB
Bash

# MANUAL STEPS FOR news-blogs-forum
1) cd /data/ftp/d4i/social_data/news-blogs-forum
2) find *.zip -exec bash -c "unzip -p {} | jq --slurp -r 'map([.content,.actor,.topicId,.mediatype,.source,.headline,.url,.dt,.language,.country]) | .[] | @csv'" \; | sed 's/\x00//g' > data4impact_corpus_allmedia.csv
3) Recreate the table in the DB using
-- Rebuild socialdata from scratch. IF EXISTS makes the step re-runnable
-- on a fresh database where the objects do not exist yet.
DROP TABLE IF EXISTS socialdata;
DROP SEQUENCE IF EXISTS socialdata_serial;
CREATE SEQUENCE socialdata_serial START 1;
CREATE TABLE socialdata (
-- Surrogate key: fixed source tag '51|social__data::' plus the MD5 of a
-- sequence counter, generated on insert so COPY never has to supply it.
id text PRIMARY KEY DEFAULT '51|social__data::'||MD5(nextval('socialdata_serial')::text),
content text,
actor text,
topicid text,
mediatype text,
source text,
headline text,
url text,
-- dt kept as text: the CSV is raw jq output and is loaded unvalidated.
dt text,
language text,
country text
);
4) Insert data:
-- Column list must match the field order produced by the jq command in step 2;
-- the CSV has no header row.
COPY socialdata(content,actor,topicId,mediatype,source,headline,url,dt,language,country) FROM '/data/ftp/d4i/social_data/news-blogs-forum/data4impact_corpus_allmedia.csv' CSV;
(OPTIONAL) If the COPY fails (e.g. on embedded NUL bytes), strip them with: perl -pi -e 's/\x00//g' data4impact_corpus_allmedia.csv and re-run the COPY.
#############################################################################################################################################################
# MANUAL STEPS FOR twitter (Buzz)
1) cd "/data/ftp/d4i/social_data/twitter/Buzz JSON Feb"
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.rtid,.rtDate,.date,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter_buzz.csv
3) recreate the table
-- Drop the dependent tags table first: twitterbuzz_tags has a foreign key to
-- twitterbuzz, so dropping the parent alone fails once the tags table exists.
-- IF EXISTS makes the step re-runnable on a fresh database.
DROP TABLE IF EXISTS twitterbuzz_tags;
DROP TABLE IF EXISTS twitterbuzz;
DROP SEQUENCE IF EXISTS twitterbuzz_serial;
CREATE SEQUENCE twitterbuzz_serial START 1;
CREATE TABLE twitterbuzz (
-- Surrogate key: '52|twitter_buzz::' prefix + MD5 of a sequence counter.
id text PRIMARY KEY DEFAULT '52|twitter_buzz::'||MD5(nextval('twitterbuzz_serial')::text),
-- tags holds the raw comma-separated list from jq; it is normalised into
-- twitterbuzz_tags in the patch step (5) and then dropped.
tags text,
language text,
country text,
content text,
topicid text,
sourcetype text,
source text,
actor text,
rtid text,
rtdate text,
date text,
headline text
);
-- One row per (tweet, tag) pair, populated by the patch step.
CREATE TABLE twitterbuzz_tags(
tb_id text REFERENCES twitterbuzz(id),
tag text,
PRIMARY KEY(tb_id, tag)
);
4) Insert data:
-- Column list must match the jq array order from step 2; no header row.
COPY twitterbuzz(tags,language,country,content,topicid,sourcetype,source,actor,rtid,rtdate,date,headline) FROM '/data/ftp/d4i/social_data/twitter/Buzz JSON Feb/twitter_buzz.csv' CSV;
5) patch data:
-- jq emits the literal string "null" when the tags array was null; blank it.
UPDATE twitterbuzz SET tags = '' WHERE tags = 'null';
-- Collapse doubled commas so the split below yields no empty tokens.
-- NOTE(review): a single replace() pass only halves a run of commas; rows
-- containing ',,,,' would need a second pass — confirm against the data.
UPDATE twitterbuzz SET tags = replace(tags, ',,', ',') WHERE tags LIKE '%,,%';
-- Explode the comma-separated list into one (id, tag) row per tag.
insert into twitterbuzz_tags(tb_id, tag) select distinct * from (select id, regexp_split_to_table(tags, ',') as tag from twitterbuzz) as t where tag != '';
-- tags is now fully normalised into twitterbuzz_tags; drop the raw column.
alter table twitterbuzz drop column tags;
#############################################################################################################################################################
# MANUAL STEPS FOR twitter
1) cd "/data/ftp/d4i/social_data/twitter/Corrected JSON"
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.retweetedActor,(.urls | join("§")),.datetime,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter.csv
3) recreate the table
-- Drop dependent tables first: twitter_tags and twitter_urls both hold
-- foreign keys to twitter, so dropping the parent alone fails on a re-run.
-- IF EXISTS makes the step re-runnable on a fresh database.
DROP TABLE IF EXISTS twitter_tags;
DROP TABLE IF EXISTS twitter_urls;
DROP TABLE IF EXISTS twitter;
DROP SEQUENCE IF EXISTS twitter_serial;
CREATE SEQUENCE twitter_serial START 1;
CREATE TABLE twitter (
-- Surrogate key: '52|twitter_____::' prefix + MD5 of a sequence counter.
id text PRIMARY KEY DEFAULT '52|twitter_____::'||MD5(nextval('twitter_serial')::text),
-- tags (comma-separated) and urls ('§'-separated) hold the raw lists from
-- jq; both are normalised into the side tables in step 5 and then dropped.
tags text,
language text,
country text,
content text,
topicid text,
sourcetype text,
source text,
actor text,
retweetedactor text,
urls text,
datetime timestamp,
headline text
);
CREATE TABLE twitter_tags(
t_id text REFERENCES twitter(id),
tag text,
PRIMARY KEY(t_id, tag)
);
CREATE TABLE twitter_urls(
t_id text REFERENCES twitter(id),
url text,
PRIMARY KEY(t_id, url)
);
4) Insert data:
-- Column list must match the jq array order from step 2; no header row.
COPY twitter(tags,language,country,content,topicid,sourcetype,source,actor,retweetedactor,urls,datetime,headline) FROM '/data/ftp/d4i/social_data/twitter/Corrected JSON/twitter.csv' CSV;
5) patch data:
-- Explode the raw delimited lists into the side tables, skipping empty
-- entries and jq's literal "null" for absent arrays.
insert into twitter_tags(t_id, tag) select distinct * from (select id, regexp_split_to_table(tags, ',') as tag from twitter) as t where tag != '' and tag != 'null';
insert into twitter_urls(t_id, url) select distinct * from (select id, regexp_split_to_table(urls, '§') as url from twitter) as t where url != '' and url != 'null';
-- Both columns are now normalised; drop the raw list columns last.
alter table twitter drop column tags;
alter table twitter drop column urls;
#############################################################################################################################################################
# MANUAL STEPS FOR twitter_threads
1) cd "/data/ftp/d4i/social_data/twitter/Threads"
2)
# Thread metadata: one CSV row per thread.
jq -r 'map([.threadId, .length, .velocity, .participants, .startId, .startTime, .endTime]) | .[] | @csv' twitter_threads_metadata.json > twitter_threads_metadata.csv
# Rebuild the tweets CSV from scratch; -f so a missing file is not an error
# on the first run.
rm -f twitter_threads_tweets.csv
# jq-1.6 is required
# Same extraction for every dump; mentions/urls arrays are joined with '§'
# so each survives as a single CSV field (normalised later in the DB).
for f in twitter_threads_doi.json twitter_threads_q1.json twitter_threads_q2.json twitter_threads_q3.json twitter_threads_q4.json twitter_threads_q5.json; do
jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' "$f" >> twitter_threads_tweets.csv
done
3)
-- Drop in reverse dependency order so the step is re-runnable; the original
-- dropped only the parent table and was missing the terminating semicolon,
-- which would make psql swallow the following CREATE into one bad statement.
DROP TABLE IF EXISTS twitter_threads_tweets_mentions;
DROP TABLE IF EXISTS twitter_threads_tweets_urls;
DROP TABLE IF EXISTS twitter_threads_tweets;
DROP TABLE IF EXISTS twitter_threads;
-- One row per conversation thread (from twitter_threads_metadata.csv).
CREATE TABLE twitter_threads (
id text PRIMARY KEY,
length int,
velocity double precision,
participants int,
-- Tweet ids are 64-bit Twitter snowflake values and overflow int4, so every
-- id-bearing numeric column below uses bigint.
startid bigint,
starttime timestamp,
endtime timestamp
);
CREATE TABLE twitter_threads_tweets(
threadid text REFERENCES twitter_threads(id),
tweetid bigint,
fromuser text,
touser text,
inreplyto bigint,
datetime timestamp,
content text,
quotedstatus text,
-- mentions/urls are raw '§'-separated lists; normalised in step 5, then dropped.
mentions text,
urls text,
PRIMARY KEY (threadid, tweetid)
);
CREATE TABLE twitter_threads_tweets_mentions(
threadid text,
tweetid bigint,
mention text,
PRIMARY KEY (threadid, tweetid, mention),
FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
);
CREATE TABLE twitter_threads_tweets_urls(
threadid text,
tweetid bigint,
url text,
PRIMARY KEY (threadid, tweetid, url),
FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
);
4) Insert data:
-- Load the parent table before the tweets table so the FK on threadid holds.
COPY twitter_threads(id, length, velocity, participants, startid, starttime, endtime) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_metadata.csv' CSV;
COPY twitter_threads_tweets(threadid, tweetid, fromuser, touser, inreplyto, datetime, content, quotedstatus, mentions, urls) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_tweets.csv' CSV;
5) patch data:
-- Explode the '§'-separated lists into the side tables, skipping empty
-- entries and jq's literal "null" for absent arrays.
insert into twitter_threads_tweets_mentions(threadid, tweetid, mention) select distinct * from (select threadid, tweetid, regexp_split_to_table(mentions, '§') as mention from twitter_threads_tweets) as t where mention != '' and mention != 'null';
insert into twitter_threads_tweets_urls (threadid, tweetid, url) select distinct * from (select threadid, tweetid, regexp_split_to_table(urls, '§') as url from twitter_threads_tweets) as t where url != '' and url != 'null';
-- Both columns are now normalised; drop the raw list columns last.
alter table twitter_threads_tweets drop column mentions;
alter table twitter_threads_tweets drop column urls;