| | |
| | """Twitter_API.ipynb |
| | |
| | Automatically generated by Colaboratory. |
| | |
| | Original file is located at |
| | https://colab.research.google.com/drive/1UAilj_PmxYbwHsc_s79d9UyBvawBVZAS |
| | |
| | # Tweet mining using Twitter API via Tweepy: |
| | |
In this notebook I use the Tweepy Python library to mine tweets with relevant hashtags. I was able to retrieve around 19,000 unique tweets via the Twitter API. At the end, all the datasets with different depressive hashtags are combined, cleaned and saved as depressive_tweets.csv.
| | """ |
| |
|
# Mount Google Drive so the scraped CSVs persist across Colab sessions.
from google.colab import drive
drive.mount('/content/drive')
| |
|
| | """## Tweets mining""" |
| |
|
| | !pip install -qqq tweepy |
| |
|
| | |
| | import tweepy |
| | from tweepy.streaming import StreamListener |
| | from tweepy import OAuthHandler |
| | from tweepy import Stream |
| | import csv |
| | import pandas as pd |
| |
|
| | |
| | |
| |
|
| | |
# Twitter API credentials, read from a local `config` module so the secrets
# stay out of the notebook.
# NOTE(review): `config` is never imported in this file -- this cell raises
# NameError unless `import config` (or an equivalent definition) ran earlier.
consumer_key = config.API_KEY
consumer_secret = config.API_KEY_SECRET
access_key= config.ACCESS_TOKEN
access_secret = config.ACCESS_TOKEN_SECRET

# OAuth 1.0a handshake; wait_on_rate_limit makes Tweepy sleep through rate
# limits instead of raising.
# NOTE(review): `wait_on_rate_limit_notify` exists only in Tweepy < 4.0 --
# confirm the pinned Tweepy version.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)
| |
|
| | |
| | |
| |
|
| | """## "#depressed"""" |
| | |
## Create a function for tweets mining
def tweets_mining1(search_query1, num_tweets1, since_id_num1):
    """Fetch up to `num_tweets1` English tweets newer than `since_id_num1`
    matching `search_query1` and append one CSV row per tweet to the
    #depressed data file.
    """
    # Collect tweets using the Cursor object; each item exposes the tweet's
    # attributes (id, created_at, full_text, user, counts, ...).
    tweet_list1 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query1,
                                                      lang="en", since_id=since_id_num1,
                                                      tweet_mode='extended').items(num_tweets1)]

    # FIX: open the output file once and append every row (the original
    # reopened the file for each tweet). Oldest-first order keeps the newest
    # tweet id on the last row, which the resume logic below relies on.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv',
              'a', newline='', encoding='utf-8') as csvFile1:
        csv_writer1 = csv.writer(csvFile1, delimiter=',')
        for tweet in tweet_list1[::-1]:
            csv_writer1.writerow([tweet.id,              # Tweet ID
                                  tweet.created_at,      # creation time
                                  tweet.full_text,       # full tweet text
                                  tweet.user.location,   # user-reported location
                                  tweet.retweet_count,   # number of retweets
                                  tweet.favorite_count])  # number of likes

# FIX(review): the original line was truncated to `search_words1 = "` (a syntax
# error); the hashtag is reconstructed from the section header -- confirm.
search_words1 = "#depressed"

# Exclude links, retweets and replies to keep only original text tweets.
search_query1 = search_words1 + " -filter:links AND -filter:retweets AND -filter:replies"
# Resume from the newest tweet id already stored (last row of the CSV).
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining1(search_query1, 1000, latest_tweet)

# Reload the accumulated CSV (no header row, so column names are supplied).
df_depressed_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv",
                             names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_depressed_1

# Unique-value counts per column; duplicates show as len(unique) < len(df)
# on 'tweet.id'.
for col in df_depressed_1:
    print("There are ", len(df_depressed_1[col].unique()), "unique values in ", col)
| |
|
| | """### Anxiety and suicide """ |
| |
|
| | |
def tweets_mining2(search_query2, num_tweets2, since_id_num2):
    """Fetch up to `num_tweets2` English tweets newer than `since_id_num2`
    matching `search_query2` and append one CSV row per tweet to the
    #anxiety data file.
    """
    # Collect tweets using the Cursor object.
    tweet_list2 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query2,
                                                      lang="en", since_id=since_id_num2,
                                                      tweet_mode='extended').items(num_tweets2)]

    # FIX: open the output file once and append every row (the original
    # reopened the file for each tweet). Oldest-first keeps the newest tweet
    # id on the last row for the resume logic below.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv',
              'a', newline='', encoding='utf-8') as csvFile2:
        csv_writer2 = csv.writer(csvFile2, delimiter=',')
        for tweet in tweet_list2[::-1]:
            csv_writer2.writerow([tweet.id,              # Tweet ID
                                  tweet.created_at,      # creation time
                                  tweet.full_text,       # full tweet text
                                  tweet.user.location,   # user-reported location
                                  tweet.retweet_count,   # number of retweets
                                  tweet.favorite_count])  # number of likes

search_words2 = "#anxiety"

# Exclude links, retweets and replies to keep only original text tweets.
search_query2 = search_words2 + " -filter:links AND -filter:retweets AND -filter:replies"
# Resume from the newest tweet id already stored (last row of the CSV).
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining2(search_query2, 2000, latest_tweet)

# Reload the accumulated CSV (no header row, so column names are supplied).
df_anxiety_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv",
                           names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_anxiety_1

# Unique-value counts per column; 'tweet.id' reveals duplicates.
for col in df_anxiety_1:
    print("There are ", len(df_anxiety_1[col].unique()), "unique values in ", col)
| |
|
| | """## "#Suicide"""" |
| | |
## Create a function for tweets mining
def tweets_mining3(search_query3, num_tweets3, since_id_num3):
    """Fetch up to `num_tweets3` English tweets newer than `since_id_num3`
    matching `search_query3` and append one CSV row per tweet to the
    #suicide data file.
    """
    # Collect tweets using the Cursor object.
    tweet_list3 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query3,
                                                      lang="en", since_id=since_id_num3,
                                                      tweet_mode='extended').items(num_tweets3)]

    # FIX: open the output file once and append every row (the original
    # reopened the file for each tweet). Oldest-first keeps the newest tweet
    # id on the last row for the resume logic below.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv',
              'a', newline='', encoding='utf-8') as csvFile3:
        csv_writer3 = csv.writer(csvFile3, delimiter=',')
        for tweet in tweet_list3[::-1]:
            csv_writer3.writerow([tweet.id,              # Tweet ID
                                  tweet.created_at,      # creation time
                                  tweet.full_text,       # full tweet text
                                  tweet.user.location,   # user-reported location
                                  tweet.retweet_count,   # number of retweets
                                  tweet.favorite_count])  # number of likes

# FIX(review): the original line was truncated to `search_words3 = "` (a syntax
# error); the hashtag is reconstructed from the section header -- confirm.
search_words3 = "#suicide"

# Exclude links, retweets and replies to keep only original text tweets.
search_query3 = search_words3 + " -filter:links AND -filter:retweets AND -filter:replies"
# Resume from the newest tweet id already stored (last row of the CSV).
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining3(search_query3, 10000, latest_tweet)

# Reload the accumulated CSV (no header row, so column names are supplied).
df_suicide_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv",
                           names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_suicide_1
| |
|
| | """## "#hopelessness"""" |
| | |
## Create a function for tweets mining
def tweets_mining4(search_query4, num_tweets4, since_id_num4):
    """Fetch up to `num_tweets4` English tweets newer than `since_id_num4`
    matching `search_query4` and append one CSV row per tweet to the
    #hopelessness data file.
    """
    # Collect tweets using the Cursor object.
    tweet_list4 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query4,
                                                      lang="en", since_id=since_id_num4,
                                                      tweet_mode='extended').items(num_tweets4)]

    # FIX: open the output file once and append every row (the original
    # reopened the file for each tweet). Oldest-first keeps the newest tweet
    # id on the last row for the resume logic below.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv',
              'a', newline='', encoding='utf-8') as csvFile4:
        csv_writer4 = csv.writer(csvFile4, delimiter=',')
        for tweet in tweet_list4[::-1]:
            csv_writer4.writerow([tweet.id,              # Tweet ID
                                  tweet.created_at,      # creation time
                                  tweet.full_text,       # full tweet text
                                  tweet.user.location,   # user-reported location
                                  tweet.retweet_count,   # number of retweets
                                  tweet.favorite_count])  # number of likes

# FIX(review): the original line was truncated to `search_words4 = "` (a syntax
# error); the hashtag is reconstructed from the section header -- confirm.
search_words4 = "#hopelessness"

# Exclude links, retweets and replies to keep only original text tweets.
search_query4 = search_words4 + " -filter:links AND -filter:retweets AND -filter:replies"
# Resume from the newest tweet id already stored (last row of the CSV).
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining4(search_query4, 10000, latest_tweet)

# Reload the accumulated CSV (no header row, so column names are supplied).
df_hopeless_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv",
                            names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_hopeless_1
| |
|
| | """## "#mentalhealth"""" |
| | |
## Create a function for tweets mining
def tweets_mining5(search_query5, num_tweets5, since_id_num5):
    """Fetch up to `num_tweets5` English tweets newer than `since_id_num5`
    matching `search_query5` and append one CSV row per tweet to the
    #mentalhealth data file.
    """
    # Collect tweets using the Cursor object.
    tweet_list5 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query5,
                                                      lang="en", since_id=since_id_num5,
                                                      tweet_mode='extended').items(num_tweets5)]

    # FIX: open the output file once and append every row (the original
    # reopened the file for each tweet). Oldest-first keeps the newest tweet
    # id on the last row for the resume logic below.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv',
              'a', newline='', encoding='utf-8') as csvFile5:
        csv_writer5 = csv.writer(csvFile5, delimiter=',')
        for tweet in tweet_list5[::-1]:
            csv_writer5.writerow([tweet.id,              # Tweet ID
                                  tweet.created_at,      # creation time
                                  tweet.full_text,       # full tweet text
                                  tweet.user.location,   # user-reported location
                                  tweet.retweet_count,   # number of retweets
                                  tweet.favorite_count])  # number of likes

# FIX(review): the original line was truncated to `search_words5 = "` (a syntax
# error); the hashtag is reconstructed from the section header -- confirm.
search_words5 = "#mentalhealth"

# Exclude links, retweets and replies to keep only original text tweets.
search_query5 = search_words5 + " -filter:links AND -filter:retweets AND -filter:replies"
# Resume from the newest tweet id already stored (last row of the CSV).
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining5(search_query5, 1000, latest_tweet)

# Reload the accumulated CSV (no header row, so column names are supplied).
df_mentalhealth_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv",
                                names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_mentalhealth_1
| |
|
| | """## "#loneliness"""" |
| | |
## Create a function for tweets mining
def tweets_mining6(search_query6, num_tweets6, since_id_num6):
    """Fetch up to `num_tweets6` English tweets newer than `since_id_num6`
    matching `search_query6` and append one CSV row per tweet to the
    #loneliness data file.
    """
    # Collect tweets using the Cursor object.
    tweet_list6 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query6,
                                                      lang="en", since_id=since_id_num6,
                                                      tweet_mode='extended').items(num_tweets6)]

    # FIX: open the output file once and append every row (the original
    # reopened the file for each tweet). Oldest-first keeps the newest tweet
    # id on the last row for the resume logic below.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv',
              'a', newline='', encoding='utf-8') as csvFile6:
        csv_writer6 = csv.writer(csvFile6, delimiter=',')
        for tweet in tweet_list6[::-1]:
            csv_writer6.writerow([tweet.id,              # Tweet ID
                                  tweet.created_at,      # creation time
                                  tweet.full_text,       # full tweet text
                                  tweet.user.location,   # user-reported location
                                  tweet.retweet_count,   # number of retweets
                                  tweet.favorite_count])  # number of likes

# FIX(review): the original line was truncated to `search_words6 = "` (a syntax
# error); the hashtag is reconstructed from the section header -- confirm.
search_words6 = "#loneliness"

# Exclude links, retweets and replies to keep only original text tweets.
search_query6 = search_words6 + " -filter:links AND -filter:retweets AND -filter:replies"
# Resume from the newest tweet id already stored (last row of the CSV).
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining6(search_query6, 10000, latest_tweet)

# Reload the accumulated CSV (no header row, so column names are supplied).
df_loneliness_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv",
                              names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_loneliness_1
| |
|
| | """## "#itsokaynottobeokay"""" |
| | |
## Create a function for tweets mining
def tweets_mining7(search_query7, num_tweets7, since_id_num7):
    """Fetch up to `num_tweets7` English tweets newer than `since_id_num7`
    matching `search_query7` and append one CSV row per tweet to the
    #itsokaynottobeokay data file.
    """
    # Collect tweets using the Cursor object.
    tweet_list7 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query7,
                                                      lang="en", since_id=since_id_num7,
                                                      tweet_mode='extended').items(num_tweets7)]

    # FIX: open the output file once and append every row (the original
    # reopened the file for each tweet). Oldest-first keeps the newest tweet
    # id on the last row for the resume logic below.
    # NOTE(review): the filename contains " copy" -- confirm this is the
    # intended data file and not a stray duplicate.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv',
              'a', newline='', encoding='utf-8') as csvFile7:
        csv_writer7 = csv.writer(csvFile7, delimiter=',')
        for tweet in tweet_list7[::-1]:
            csv_writer7.writerow([tweet.id,              # Tweet ID
                                  tweet.created_at,      # creation time
                                  tweet.full_text,       # full tweet text
                                  tweet.user.location,   # user-reported location
                                  tweet.retweet_count,   # number of retweets
                                  tweet.favorite_count])  # number of likes

# FIX(review): the original line was truncated to `search_words7 = "` (a syntax
# error); the hashtag is reconstructed from the section header -- confirm.
search_words7 = "#itsokaynottobeokay"

# Exclude links, retweets and replies to keep only original text tweets.
search_query7 = search_words7 + " -filter:links AND -filter:retweets AND -filter:replies"
# Resume from the newest tweet id already stored (last row of the CSV).
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining7(search_query7, 2000, latest_tweet)

# Reload the accumulated CSV (no header row, so column names are supplied).
df_itsok_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv",
                         names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_itsok_1
| |
|
| | """## "#depression"""" |
| | |
## Create a function for tweets mining
def tweets_mining8(search_query8, num_tweets8, since_id_num8):
    """Fetch up to `num_tweets8` English tweets newer than `since_id_num8`
    matching `search_query8` and append one CSV row per tweet to the
    #depression data file.
    """
    # Collect tweets using the Cursor object.
    tweet_list8 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query8,
                                                      lang="en", since_id=since_id_num8,
                                                      tweet_mode='extended').items(num_tweets8)]

    # FIX: open the output file once and append every row (the original
    # reopened the file for each tweet). Oldest-first keeps the newest tweet
    # id on the last row for the resume logic below.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv',
              'a', newline='', encoding='utf-8') as csvFile8:
        csv_writer8 = csv.writer(csvFile8, delimiter=',')
        for tweet in tweet_list8[::-1]:
            csv_writer8.writerow([tweet.id,              # Tweet ID
                                  tweet.created_at,      # creation time
                                  tweet.full_text,       # full tweet text
                                  tweet.user.location,   # user-reported location
                                  tweet.retweet_count,   # number of retweets
                                  tweet.favorite_count])  # number of likes

# FIX(review): the original line was truncated to `search_words8 = "` (a syntax
# error); the hashtag is reconstructed from the section header -- confirm.
search_words8 = "#depression"

# Exclude links, retweets and replies to keep only original text tweets.
search_query8 = search_words8 + " -filter:links AND -filter:retweets AND -filter:replies"
# Resume from the newest tweet id already stored (last row of the CSV).
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining8(search_query8, 1000, latest_tweet)

# Reload the accumulated CSV (no header row, so column names are supplied).
df_depression_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv",
                              names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_depression_1

# Unique-value counts per column; 'tweet.id' reveals duplicates.
for col in df_depression_1:
    print("There are ", len(df_depression_1[col].unique()), "unique values in ", col)
| |
|
| | """## "#sad"""" |
| | |
## Create a function for tweets mining
def tweets_mining9(search_query9, num_tweets9, since_id_num9):
    """Fetch up to `num_tweets9` English tweets newer than `since_id_num9`
    matching `search_query9` and append one CSV row per tweet to the
    #sad data file.
    """
    # Collect tweets using the Cursor object.
    tweet_list9 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query9,
                                                      lang="en", since_id=since_id_num9,
                                                      tweet_mode='extended').items(num_tweets9)]

    # FIX: open the output file once and append every row (the original
    # reopened the file for each tweet). Oldest-first keeps the newest tweet
    # id on the last row for the resume logic below.
    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv',
              'a', newline='', encoding='utf-8') as csvFile9:
        csv_writer9 = csv.writer(csvFile9, delimiter=',')
        for tweet in tweet_list9[::-1]:
            csv_writer9.writerow([tweet.id,              # Tweet ID
                                  tweet.created_at,      # creation time
                                  tweet.full_text,       # full tweet text
                                  tweet.user.location,   # user-reported location
                                  tweet.retweet_count,   # number of retweets
                                  tweet.favorite_count])  # number of likes

# FIX(review): the original line was truncated to `search_words9 = "` (a syntax
# error); the hashtag is reconstructed from the section header -- confirm.
search_words9 = "#sad"

# Exclude links, retweets and replies to keep only original text tweets.
search_query9 = search_words9 + " -filter:links AND -filter:retweets AND -filter:replies"
# Resume from the newest tweet id already stored (last row of the CSV).
with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv', encoding='utf-8') as data:
    latest_tweet = int(list(csv.reader(data))[-1][0])
tweets_mining9(search_query9, 2000, latest_tweet)

# Reload the accumulated CSV (no header row, so column names are supplied).
df_sad_1 = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv",
                       names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])

df_sad_1
| |
|
| | """# Combining all the tweets""" |
| |
|
import glob

# Gather every per-hashtag CSV written by the mining cells above.
path = r'/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API'
all_files = glob.glob(path + "/*.csv")

# Load each file with the shared schema (the raw CSVs have no header row).
tweets = []
for filename in all_files:
    df = pd.read_csv(filename,
                     names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"])
    tweets.append(df)

# Stack all hashtag datasets into a single frame with a fresh index.
tweets_df = pd.concat(tweets, ignore_index=True)

tweets_df.head()

tweets_df

# FIX: write without the pandas index -- the original emitted an extra unnamed
# index column, which would shift the `names=` column list if this file were
# read back the same way as the raw CSVs. (A header row is still written.)
tweets_df.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv',
                 index=False)
| |
|
| | """## Data cleaning |
| | |
Data cleaning is an essential step: without a proper cleaning procedure, errors propagate into the analysis and eventually into the data-driven results. Here I eliminate duplicate tweets using the primary key ('tweet.id'), check for empty rows, and replace any NaN values.
| | """ |
| |
|
# Inspect the combined dataset.
tweets_df.shape

# Column dtypes, rendered as a one-column frame for readability.
tweets_df.dtypes.to_frame().rename(columns={0:'data_type'})

# Unique-value counts per column; duplicates show up as len(unique) < len(df)
# on 'tweet.id'.
# NOTE(review): the section docstring promises de-duplication and NaN
# replacement, but no drop_duplicates()/fillna() call appears in this file --
# confirm that cleaning happens elsewhere.
for col in tweets_df:
    print("There are ", len(tweets_df[col].unique()), "unique values in ", col)