# =========================================================================
# Data4Impact social-data loading runbook (bash + PostgreSQL manual steps)
# =========================================================================
# MANUAL STEPS FOR news-blogs-forum

1) cd /data/ftp/d4i/social_data/news-blogs-forum

2) Extract every zip, convert the JSON to CSV, strip NUL bytes.
   The file name is passed as an argument to bash -c instead of splicing {}
   into the command string, which is unsafe for arbitrary names (ShellCheck SC2156):

   find . -maxdepth 1 -name '*.zip' -exec bash -c 'unzip -p "$1" | jq --slurp -r "map([.content,.actor,.topicId,.mediatype,.source,.headline,.url,.dt,.language,.country]) | .[] | @csv"' _ {} \; | sed 's/\x00//g' > data4impact_corpus_allmedia.csv

3) Recreate the table in the DB using:

-- Rebuild the socialdata table from scratch (destructive: all rows are lost).
-- IF EXISTS keeps the runbook re-runnable, including on a fresh database.
DROP TABLE IF EXISTS socialdata;
DROP SEQUENCE IF EXISTS socialdata_serial;

CREATE SEQUENCE socialdata_serial START 1;

-- All payload columns are deliberately text: the CSV comes straight from the
-- feed and values (including dt) are loaded unvalidated.
CREATE TABLE socialdata (
    -- Synthetic id: fixed namespace prefix + MD5 of the next sequence value.
    -- Kept exactly as-is for compatibility with downstream consumers of the ids.
    id text PRIMARY KEY DEFAULT '51|social__data::'||MD5(nextval('socialdata_serial')::text),
    content text,
    actor text,
    topicId text,
    mediatype text,
    source text,
    headline text,
    url text,
    dt text,
    language text,
    country text
);

4) Insert data:

-- Bulk-load the CSV produced in step 2; the column order must match the jq array.
COPY socialdata(content,actor,topicId,mediatype,source,headline,url,dt,language,country) FROM '/data/ftp/d4i/social_data/news-blogs-forum/data4impact_corpus_allmedia.csv' CSV;

(OPTIONAL) If the COPY fails (e.g. on stray NUL bytes), strip them and retry:
    perl -pi -e 's/\x00//g' data4impact_corpus_allmedia.csv

#############################################################################################################################################################

# MANUAL STEPS FOR twitter (Buzz)

1) cd "/data/ftp/d4i/social_data/twitter/Buzz JSON Feb"

# Convert every Buzz JSON file to CSV rows. join/1 is redefined inline,
# presumably because the installed jq predates the builtin — TODO confirm.
# A null .tags array yields the literal string 'null' (first slot goes through
# tostring) or empty slots; both are cleaned up in step 5. NUL bytes are
# stripped so Postgres COPY accepts the file.
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.rtid,.rtDate,.date,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter_buzz.csv

3) Recreate the tables:

-- Rebuild the twitterbuzz table (destructive). twitterbuzz_tags has a foreign
-- key to twitterbuzz(id), so it must be dropped first or the DROP below fails
-- with a dependency error.
DROP TABLE IF EXISTS twitterbuzz_tags;
DROP TABLE IF EXISTS twitterbuzz;
DROP SEQUENCE IF EXISTS twitterbuzz_serial;

CREATE SEQUENCE twitterbuzz_serial START 1;

CREATE TABLE twitterbuzz (
    -- Synthetic id: fixed namespace prefix + MD5 of the next sequence value.
    id text PRIMARY KEY DEFAULT '52|twitter_buzz::'||MD5(nextval('twitterbuzz_serial')::text),
    -- Raw comma-separated tag list; normalized into twitterbuzz_tags and
    -- dropped again in step 5.
    tags text,
    language text,
    country text,
    content text,
    topicid text,
    sourcetype text,
    source text,
    actor text,
    rtid text,
    rtdate text,
    date text,
    headline text
);
|
|
|
|
-- Junction table: one row per (tweet, tag) pair extracted in step 5 from the
-- raw comma-separated tags column of twitterbuzz.
CREATE TABLE twitterbuzz_tags(
    tb_id text REFERENCES twitterbuzz(id),
    tag text,
    PRIMARY KEY(tb_id, tag)
);

4) Insert data:

-- Bulk-load the Buzz CSV produced in step 2 (column order matches the jq array).
COPY twitterbuzz(tags,language,country,content,topicid,sourcetype,source,actor,rtid,rtdate,date,headline) FROM '/data/ftp/d4i/social_data/twitter/Buzz JSON Feb/twitter_buzz.csv' CSV;

5) patch data:

-- The jq join renders an entirely-null tag list as the literal string 'null';
-- blank those rows out first.
UPDATE twitterbuzz SET tags = '' WHERE tags = 'null';

-- Collapse empty list slots. NOTE: a single replace() pass leaves ',,' behind
-- for runs of 3+ commas; the resulting empty entries are filtered out below.
UPDATE twitterbuzz SET tags = replace(tags, ',,', ',') WHERE tags LIKE '%,,%';

-- Explode the comma-separated tags into the junction table, skipping empty
-- entries and literal 'null' entries (jq artifacts; the twitter section
-- applies the same filter).
INSERT INTO twitterbuzz_tags(tb_id, tag)
SELECT DISTINCT id, tag
FROM (SELECT id, regexp_split_to_table(tags, ',') AS tag FROM twitterbuzz) AS exploded
WHERE tag != '' AND tag != 'null';

-- The raw CSV list column is no longer needed once normalized.
ALTER TABLE twitterbuzz DROP COLUMN tags;

#############################################################################################################################################################

# MANUAL STEPS FOR twitter

1) cd "/data/ftp/d4i/social_data/twitter/Corrected JSON"

# Convert every corrected-JSON file to CSV rows. join/1 is redefined inline,
# presumably because the installed jq predates the builtin — TODO confirm.
# URLs are joined with '§' (a separator assumed not to occur inside URLs);
# both lists are normalized into junction tables in step 5. NUL bytes are
# stripped so Postgres COPY accepts the file.
2) find *.json -exec jq -r 'def join(sep): sep as $sep | reduce .[1:][] as $item (.[0]|tostring; . + $sep + $item); map ([(.tags | join(",")),.language,.country,.content,.topicId,.sourceType,.source,.actor,.retweetedActor,(.urls | join("§")),.datetime,.headline]) | .[] | @csv' {} \; | sed 's/\x00//g' > twitter.csv

3) Recreate the tables:

-- Rebuild the twitter table (destructive). twitter_tags and twitter_urls
-- reference twitter(id), so they must be dropped first or the DROP below
-- fails with a dependency error.
DROP TABLE IF EXISTS twitter_tags;
DROP TABLE IF EXISTS twitter_urls;
DROP TABLE IF EXISTS twitter;
DROP SEQUENCE IF EXISTS twitter_serial;

CREATE SEQUENCE twitter_serial START 1;

CREATE TABLE twitter (
    -- Synthetic id: fixed namespace prefix + MD5 of the next sequence value.
    id text PRIMARY KEY DEFAULT '52|twitter_____::'||MD5(nextval('twitter_serial')::text),
    -- Raw comma-separated tag list; normalized into twitter_tags in step 5.
    tags text,
    language text,
    country text,
    content text,
    topicid text,
    sourcetype text,
    source text,
    actor text,
    retweetedactor text,
    -- Raw '§'-separated URL list; normalized into twitter_urls in step 5.
    urls text,
    datetime timestamp,
    headline text
);
|
|
|
|
-- Junction tables for the list-valued columns exploded in step 5.

CREATE TABLE twitter_tags(
    t_id text REFERENCES twitter(id),
    tag text,
    PRIMARY KEY(t_id, tag)
);

CREATE TABLE twitter_urls(
    t_id text REFERENCES twitter(id),
    url text,
    PRIMARY KEY(t_id, url)
);

4) Insert data:

-- Bulk-load the twitter CSV produced in step 2 (column order matches the jq array).
COPY twitter(tags,language,country,content,topicid,sourcetype,source,actor,retweetedactor,urls,datetime,headline) FROM '/data/ftp/d4i/social_data/twitter/Corrected JSON/twitter.csv' CSV;

5) patch data:

-- Normalize the raw list columns into the junction tables, skipping empty
-- entries and literal 'null' entries (jq artifacts), then drop the columns.
INSERT INTO twitter_tags(t_id, tag)
SELECT DISTINCT id, tag
FROM (SELECT id, regexp_split_to_table(tags, ',') AS tag FROM twitter) AS exploded
WHERE tag != '' AND tag != 'null';

INSERT INTO twitter_urls(t_id, url)
SELECT DISTINCT id, url
FROM (SELECT id, regexp_split_to_table(urls, '§') AS url FROM twitter) AS exploded
WHERE url != '' AND url != 'null';

ALTER TABLE twitter DROP COLUMN tags;
ALTER TABLE twitter DROP COLUMN urls;

#############################################################################################################################################################

# MANUAL STEPS FOR twitter_threads

1) cd "/data/ftp/d4i/social_data/twitter/Threads"

2) Regenerate the CSVs:

jq -r 'map([.threadId, .length, .velocity, .participants, .startId, .startTime, .endTime]) | .[] | @csv' twitter_threads_metadata.json > twitter_threads_metadata.csv

# jq-1.6 is required
# Rebuild the tweets CSV from all source files: truncate first (-f so a
# missing file on the first run is not an error), then append each part with
# the same field order.
rm -f twitter_threads_tweets.csv
for f in twitter_threads_doi.json twitter_threads_q1.json twitter_threads_q2.json \
         twitter_threads_q3.json twitter_threads_q4.json twitter_threads_q5.json; do
    jq -r 'map([.threadId, .tweetId, .fromUser, .toUser, .inReplyTo, .dateTime, .content, .quotedStatus, (.mentions|join("§")), (.urls|join("§"))]) | .[] | @csv' "$f" >> twitter_threads_tweets.csv
done

3) Recreate the tables:

-- Rebuild the thread tables (destructive). Dependents are dropped first
-- because of the FK chain: mentions/urls -> twitter_threads_tweets -> twitter_threads.
DROP TABLE IF EXISTS twitter_threads_tweets_mentions;
DROP TABLE IF EXISTS twitter_threads_tweets_urls;
DROP TABLE IF EXISTS twitter_threads_tweets;
DROP TABLE IF EXISTS twitter_threads;  -- the original was missing the terminating ';'

CREATE TABLE twitter_threads (
    id text PRIMARY KEY,
    length int,
    velocity double precision,
    participants int,
    -- startid holds a tweet id; Twitter status ids are 64-bit values that
    -- overflow a 32-bit int, so bigint is required for the COPY to succeed.
    startid bigint,
    starttime timestamp,
    endtime timestamp
);
|
|
|
|
-- One row per tweet in a thread.
CREATE TABLE twitter_threads_tweets(
    threadid text REFERENCES twitter_threads(id),
    -- Twitter status ids are 64-bit values and overflow a 32-bit int during
    -- COPY, so tweetid and inreplyto are bigint.
    tweetid bigint,
    fromuser text,
    touser text,
    inreplyto bigint,
    datetime timestamp,
    content text,
    quotedstatus text,
    -- Raw '§'-separated lists; normalized into the *_mentions/*_urls tables
    -- in step 5 and then dropped.
    mentions text,
    urls text,
    PRIMARY KEY (threadid, tweetid)
);
|
|
|
|
-- One row per (tweet, mention) pair extracted in step 5 from the raw
-- mentions column of twitter_threads_tweets.
CREATE TABLE twitter_threads_tweets_mentions(
    threadid text,
    -- bigint to match twitter_threads_tweets.tweetid (64-bit tweet ids).
    tweetid bigint,
    mention text,
    PRIMARY KEY (threadid, tweetid, mention),
    FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
);
|
|
|
|
-- One row per (tweet, url) pair extracted in step 5 from the raw urls
-- column of twitter_threads_tweets.
CREATE TABLE twitter_threads_tweets_urls(
    threadid text,
    -- bigint to match twitter_threads_tweets.tweetid (64-bit tweet ids).
    tweetid bigint,
    url text,
    PRIMARY KEY (threadid, tweetid, url),
    FOREIGN KEY (threadid, tweetid) REFERENCES twitter_threads_tweets(threadid, tweetid)
);

4) Insert data:

-- Bulk-load the thread metadata CSV produced in step 2.
COPY twitter_threads(id, length, velocity, participants, startid, starttime, endtime) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_metadata.csv' CSV;
|
|
-- Bulk-load the per-tweet CSV produced in step 2 (must run after the
-- metadata COPY so the threadid foreign keys resolve).
COPY twitter_threads_tweets(threadid, tweetid, fromuser, touser, inreplyto, datetime, content, quotedstatus, mentions, urls) FROM '/data/ftp/d4i/social_data/twitter/Threads/twitter_threads_tweets.csv' CSV;

5) patch data:

-- Explode the '§'-separated lists into the junction tables, skipping empty
-- entries and literal 'null' entries (jq artifacts), then drop the columns.
INSERT INTO twitter_threads_tweets_mentions(threadid, tweetid, mention)
SELECT DISTINCT threadid, tweetid, mention
FROM (SELECT threadid, tweetid, regexp_split_to_table(mentions, '§') AS mention
      FROM twitter_threads_tweets) AS exploded
WHERE mention != '' AND mention != 'null';

INSERT INTO twitter_threads_tweets_urls (threadid, tweetid, url)
SELECT DISTINCT threadid, tweetid, url
FROM (SELECT threadid, tweetid, regexp_split_to_table(urls, '§') AS url
      FROM twitter_threads_tweets) AS exploded
WHERE url != '' AND url != 'null';

ALTER TABLE twitter_threads_tweets DROP COLUMN mentions;
ALTER TABLE twitter_threads_tweets DROP COLUMN urls;
|
|
|