{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Twitter_API.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "n9mFOtjUGKmk"
},
"source": [
"# Tweet mining using Twitter API via Tweepy:"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-3bUQ54_84g8"
},
"source": [
"In this notebook I am using Tweepy python library to tweets using relevant hashtags. I was able to retrieve around 19000 unique tweets via twitter API. At the end, all the datasets with different depressive hashtags will be combined, cleaned and saved as depressive_tweets.csv."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1Bojm_bffNAV",
"outputId": "92f04f31-eb1b-4c13-f811-1cad9d759a34"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/drive\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7iWDBsjTwEyZ"
},
"source": [
"## Tweets mining"
]
},
{
"cell_type": "code",
"metadata": {
"id": "TtZk0vyLwWwW"
},
"source": [
"!pip install -qqq tweepy"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jobjTBDIwhUl"
},
"source": [
"## Import required libraries\n",
"import tweepy\n",
"from tweepy.streaming import StreamListener\n",
"from tweepy import OAuthHandler\n",
"from tweepy import Stream\n",
"import csv\n",
"import pandas as pd\n",
"\n",
"## Access to twitter API cunsumer_key and access_secret\n",
"#import config.ipynb"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Dv5AsxY6iL2s"
},
"source": [
"## Twitter API related information\n",
"consumer_key = config.API_KEY\n",
"consumer_secret = config.API_KEY_SECRET\n",
"access_key= config.ACCESS_TOKEN\n",
"access_secret = config.ACCESS_TOKEN_SECRET"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "M6mSp-B_vzn-"
},
"source": [
"auth = tweepy.OAuthHandler(consumer_key, consumer_secret) # Pass in Consumer key and secret for authentication by API\n",
"auth.set_access_token(access_key, access_secret) # Pass in Access key and secret for authentication by API\n",
"api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True) # Sleeps when API limit is reached"
],
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "FHqBQHYpDcz_"
},
"source": [
"## depress_tags = [\"#depressed\", \"#anxiety\", \"#depression\", \"#suicide\", \"#mentalhealth\"\n",
"## \"#loneliness\", \"#hopelessness\", \"#itsokaynottobeokay\", \"#sad\"]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0-BvNrToRims"
},
"source": [
"## \"#depressed\""
]
},
{
"cell_type": "code",
"metadata": {
"id": "BERTal4NwVNx"
},
"source": [
"## Create a function for tweets mining\n",
"def tweets_mining1(search_query1, num_tweets1, since_id_num1):\n",
" # Collect tweets using the Cursor object\n",
" # Each item in the iterator has various attributes that you can access to get information about each tweet\n",
" tweet_list1 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query1, lang=\"en\", since_id=since_id_num1, \n",
" tweet_mode='extended').items(num_tweets1)]\n",
" \n",
" # Begin scraping the tweets individually:\n",
" for tweet in tweet_list1[::-1]:\n",
" tweet_id = tweet.id # get Tweet ID result\n",
" created_at = tweet.created_at # get time tweet was created\n",
" text = tweet.full_text # retrieve full tweet text\n",
" location = tweet.user.location # retrieve user location\n",
" retweet = tweet.retweet_count # retrieve number of retweets\n",
" favorite = tweet.favorite_count # retrieve number of likes\n",
" with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv','a', newline='', encoding='utf-8') as csvFile1:\n",
" csv_writer1 = csv.writer(csvFile1, delimiter=',') # create an instance of csv object\n",
" csv_writer1.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "8LOXgG5xygnj"
},
"source": [
"search_words1 = \"#depressed\" # Specifying exact phrase to search\n",
"# Exclude Links, retweets, replies\n",
"search_query1 = search_words1 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n",
"with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv', encoding='utf-8') as data:\n",
" latest_tweet = int(list(csv.reader(data))[-1][0]) \n",
"tweets_mining1(search_query1, 1000, latest_tweet)"
],
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "JSDTPj7Nz5Rh"
},
"source": [
"df_depressed_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv\",\n",
" names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])"
],
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "aQe7bso7VBZA",
"outputId": "bed5b299-8399-4b86-f6d6-630085f308a8"
},
"source": [
"df_depressed_1"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1446882366945837057 | \n",
" 2021-10-09 16:56:52 | \n",
" I totally need someone to hug me TIGHT and say... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1446896799860539394 | \n",
" 2021-10-09 17:54:13 | \n",
" i plan on committing suicide today or tommorro... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1446912210672959491 | \n",
" 2021-10-09 18:55:28 | \n",
" Exhausted! Absolutely exhausted and my day isn... | \n",
" Lost 🤕 | \n",
" 0 | \n",
" 8 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1446931930537209856 | \n",
" 2021-10-09 20:13:49 | \n",
" Im going to get Far Cry 6 and playing video ga... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1446934914453082113 | \n",
" 2021-10-09 20:25:41 | \n",
" Just #depressed haven’t made money in 4 days o... | \n",
" Daddy’s lap. | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 1440 | \n",
" 1459292661848883203 | \n",
" 2021-11-12 22:50:57 | \n",
" it gets dark at 5 now. #depressed | \n",
" Toronto, Ontario | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 1441 | \n",
" 1459295472993153030 | \n",
" 2021-11-12 23:02:07 | \n",
" Ignore my tweets, if I tweet, for the next cou... | \n",
" Paisley, Scotland | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1442 | \n",
" 1459323510803759108 | \n",
" 2021-11-13 00:53:32 | \n",
" how tf you a psychology major and depressed? l... | \n",
" San Diego, CA | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1443 | \n",
" 1459376207527440385 | \n",
" 2021-11-13 04:22:56 | \n",
" Liquors my bestie till my flight tomorrow fml ... | \n",
" Dreamville, LBC♥ | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1444 | \n",
" 1459497253698035714 | \n",
" 2021-11-13 12:23:56 | \n",
" i signed up for @netflix just so i can watch b... | \n",
" Washington, USA | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
1445 rows × 6 columns
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1446882366945837057 2021-10-09 16:56:52 ... 0 1\n",
"1 1446896799860539394 2021-10-09 17:54:13 ... 0 1\n",
"2 1446912210672959491 2021-10-09 18:55:28 ... 0 8\n",
"3 1446931930537209856 2021-10-09 20:13:49 ... 0 1\n",
"4 1446934914453082113 2021-10-09 20:25:41 ... 0 2\n",
"... ... ... ... ... ...\n",
"1440 1459292661848883203 2021-11-12 22:50:57 ... 0 2\n",
"1441 1459295472993153030 2021-11-12 23:02:07 ... 0 1\n",
"1442 1459323510803759108 2021-11-13 00:53:32 ... 0 0\n",
"1443 1459376207527440385 2021-11-13 04:22:56 ... 0 0\n",
"1444 1459497253698035714 2021-11-13 12:23:56 ... 0 0\n",
"\n",
"[1445 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gnZnQBdQ8VZL",
"outputId": "2dc93be1-17f9-4b9d-d1d5-cab5eafdb544"
},
"source": [
"## Finding unique values in each column\n",
"for col in df_depressed_1:\n",
" print(\"There are \", len(df_depressed_1[col].unique()), \"unique values in \", col)"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"There are 849 unique values in tweet.id\n",
"There are 849 unique values in created_at\n",
"There are 843 unique values in text\n",
"There are 383 unique values in location\n",
"There are 7 unique values in retweet\n",
"There are 25 unique values in favorite\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jVSywSxSvYbS"
},
"source": [
"### Anxiety and suicide "
]
},
{
"cell_type": "code",
"metadata": {
"id": "1UWM-o41vd6Z"
},
"source": [
"## Create a function for tweets mining\n",
"def tweets_mining2(search_query2, num_tweets2, since_id_num2):\n",
" # Collect tweets using the Cursor object\n",
" # Each item in the iterator has various attributes that you can access to get information about each tweet\n",
" tweet_list2 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query2, lang=\"en\", since_id=since_id_num2, \n",
" tweet_mode='extended').items(num_tweets2)]\n",
" \n",
" # Begin scraping the tweets individually:\n",
" for tweet in tweet_list2[::-1]:\n",
" tweet_id = tweet.id # get Tweet ID result\n",
" created_at = tweet.created_at # get time tweet was created\n",
" text = tweet.full_text # retrieve full tweet text\n",
" location = tweet.user.location # retrieve user location\n",
" retweet = tweet.retweet_count # retrieve number of retweets\n",
" favorite = tweet.favorite_count # retrieve number of likes\n",
" with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv','a', newline='', encoding='utf-8') as csvFile2:\n",
" csv_writer2 = csv.writer(csvFile2, delimiter=',') # create an instance of csv object\n",
" csv_writer2.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row"
],
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "4WS3HYJ_yUPe"
},
"source": [
"search_words2 = \"#anxiety\" # Specifying exact phrase to search\n",
"# Exclude Links, retweets, replies\n",
"search_query2 = search_words2 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
"with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv', encoding='utf-8') as data:\n",
" latest_tweet = int(list(csv.reader(data))[-1][0]) \n",
"tweets_mining2(search_query2, 2000, latest_tweet)"
],
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "mMnPf-UoD1gA"
},
"source": [
"df_anxiety_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv\",\n",
" names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])"
],
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "SyvsN8-3D73N",
"outputId": "d139df05-638a-4a91-e94c-e7560db53069"
},
"source": [
"df_anxiety_1"
],
"execution_count": 14,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1447067749654614019 | \n",
" 2021-10-10 05:13:31 | \n",
" I can't wait to get the hell out. so I'll jus... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1447069714379857927 | \n",
" 2021-10-10 05:21:19 | \n",
" Morning. All people except me sleeping. @Billy... | \n",
" Queenie's Castle,Yate, S Glos | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1447072203388985346 | \n",
" 2021-10-10 05:31:13 | \n",
" On #WorldMentalHealthDay, a big shoutout to my... | \n",
" Bengaluru/Muscat/Palakad/Kochi | \n",
" 0 | \n",
" 9 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1447072334825754626 | \n",
" 2021-10-10 05:31:44 | \n",
" I hate having anxiety about doing stuff that I... | \n",
" Utah, USA | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1447074986531848192 | \n",
" 2021-10-10 05:42:16 | \n",
" I am not scared of my ADHD, depression and anx... | \n",
" Wollongong, New South Wales | \n",
" 2 | \n",
" 11 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 6867 | \n",
" 1459224031777939460 | \n",
" 2021-11-12 18:18:14 | \n",
" It’s amazing how everyone runs to me as the su... | \n",
" Pennsylvania, USA | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6868 | \n",
" 1459224808512704516 | \n",
" 2021-11-12 18:21:20 | \n",
" Any suggestions on settling the stomach after ... | \n",
" Everywhere, Anywhere | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6869 | \n",
" 1459228047278751747 | \n",
" 2021-11-12 18:34:12 | \n",
" Gotta love that superpowered #anxiety taking h... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6870 | \n",
" 1459229518128893952 | \n",
" 2021-11-12 18:40:02 | \n",
" Growth nor healing is linear. Sometimes you ma... | \n",
" London | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6871 | \n",
" 1459230527358222337 | \n",
" 2021-11-12 18:44:03 | \n",
" Just read on a YouTube comment how mentally il... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
6872 rows × 6 columns
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1447067749654614019 2021-10-10 05:13:31 ... 0 0\n",
"1 1447069714379857927 2021-10-10 05:21:19 ... 0 1\n",
"2 1447072203388985346 2021-10-10 05:31:13 ... 0 9\n",
"3 1447072334825754626 2021-10-10 05:31:44 ... 0 0\n",
"4 1447074986531848192 2021-10-10 05:42:16 ... 2 11\n",
"... ... ... ... ... ...\n",
"6867 1459224031777939460 2021-11-12 18:18:14 ... 0 0\n",
"6868 1459224808512704516 2021-11-12 18:21:20 ... 0 0\n",
"6869 1459228047278751747 2021-11-12 18:34:12 ... 0 0\n",
"6870 1459229518128893952 2021-11-12 18:40:02 ... 0 0\n",
"6871 1459230527358222337 2021-11-12 18:44:03 ... 0 0\n",
"\n",
"[6872 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 14
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ygvC0l-C9NXp",
"outputId": "cef49691-326a-43d4-a7d5-28725aafc5b5"
},
"source": [
"## Finding unique values in each column\n",
"for col in df_anxiety_1:\n",
" print(\"There are \", len(df_anxiety_1[col].unique()), \"unique values in \", col)"
],
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"There are 4738 unique values in tweet.id\n",
"There are 4733 unique values in created_at\n",
"There are 4342 unique values in text\n",
"There are 1381 unique values in location\n",
"There are 33 unique values in retweet\n",
"There are 80 unique values in favorite\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iSbEvJo0CVBh"
},
"source": [
"## \"#Suicide\""
]
},
{
"cell_type": "code",
"metadata": {
"id": "ofqzhBcR1bj-"
},
"source": [
"## Create a function for tweets mining\n",
"def tweets_mining3(search_query3, num_tweets3, since_id_num3):\n",
" # Collect tweets using the Cursor object\n",
" # Each item in the iterator has various attributes that you can access to get information about each tweet\n",
" tweet_list3 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query3, lang=\"en\", since_id=since_id_num3, \n",
" tweet_mode='extended').items(num_tweets3)]\n",
" \n",
" # Begin scraping the tweets individually:\n",
" for tweet in tweet_list3[::-1]:\n",
" tweet_id = tweet.id # get Tweet ID result\n",
" created_at = tweet.created_at # get time tweet was created\n",
" text = tweet.full_text # retrieve full tweet text\n",
" location = tweet.user.location # retrieve user location\n",
" retweet = tweet.retweet_count # retrieve number of retweets\n",
" favorite = tweet.favorite_count # retrieve number of likes\n",
" with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv','a', newline='', encoding='utf-8') as csvFile3:\n",
" csv_writer3 = csv.writer(csvFile3, delimiter=',') # create an instance of csv object\n",
" csv_writer3.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row"
],
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_wIXzt57Cn3e"
},
"source": [
"search_words3 = \"#suicide\" # Specifying exact phrase to search\n",
"# Exclude Links, retweets, replies\n",
"search_query3 = search_words3 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n",
"with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv', encoding='utf-8') as data:\n",
" latest_tweet = int(list(csv.reader(data))[-1][0]) \n",
"tweets_mining3(search_query3, 10000, latest_tweet)"
],
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "XkfhTVodENiy"
},
"source": [
"df_suicide_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv\",\n",
" names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])"
],
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "8HAqIISVEXFy",
"outputId": "0667d586-2e25-4690-95b4-a86f748e9eae"
},
"source": [
"df_suicide_1"
],
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1447381474034999296 | \n",
" 2021-10-11 02:00:09 | \n",
" #suicide is the strong belief that no matter h... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1447439429409415172 | \n",
" 2021-10-11 05:50:26 | \n",
" \"suicide\"\\nHollowness enough\\nSilence enough\\n... | \n",
" NaN | \n",
" 2 | \n",
" 2 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1447444376464998400 | \n",
" 2021-10-11 06:10:06 | \n",
" Every year passes but the pain remains the sam... | \n",
" India | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1447445469467131906 | \n",
" 2021-10-11 06:14:26 | \n",
" Have I told you how much I hate my life😂😂😁 #su... | \n",
" Ohio, USA | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1447461306295013377 | \n",
" 2021-10-11 07:17:22 | \n",
" The man responsible for the #CDC policies that... | \n",
" United States | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 713 | \n",
" 1459446304577363971 | \n",
" 2021-11-13 09:01:28 | \n",
" Someone wanted me to tell you. You're beautifu... | \n",
" D(1) Florida | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 714 | \n",
" 1459454059975352320 | \n",
" 2021-11-13 09:32:17 | \n",
" It's a regular thing🙂💔\\n#Coimbatore #suicide #... | \n",
" Tiruppur, India | \n",
" 0 | \n",
" 3 | \n",
"
\n",
" \n",
" | 715 | \n",
" 1459454073644765185 | \n",
" 2021-11-13 09:32:21 | \n",
" #Suicide is not as bad as people make it \\n\\nB... | \n",
" The Chisolm Trail | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 716 | \n",
" 1459495548373934081 | \n",
" 2021-11-13 12:17:09 | \n",
" Just Uploaded My Review Of Dear Evan Hansen To... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 717 | \n",
" 1459525084180267027 | \n",
" 2021-11-13 14:14:31 | \n",
" On #WorldKindnessDay we would just like to say... | \n",
" NaN | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
718 rows × 6 columns
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1447381474034999296 2021-10-11 02:00:09 ... 0 0\n",
"1 1447439429409415172 2021-10-11 05:50:26 ... 2 2\n",
"2 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n",
"3 1447445469467131906 2021-10-11 06:14:26 ... 0 1\n",
"4 1447461306295013377 2021-10-11 07:17:22 ... 1 2\n",
".. ... ... ... ... ...\n",
"713 1459446304577363971 2021-11-13 09:01:28 ... 0 0\n",
"714 1459454059975352320 2021-11-13 09:32:17 ... 0 3\n",
"715 1459454073644765185 2021-11-13 09:32:21 ... 0 0\n",
"716 1459495548373934081 2021-11-13 12:17:09 ... 0 0\n",
"717 1459525084180267027 2021-11-13 14:14:31 ... 1 5\n",
"\n",
"[718 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 13
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZilsrGx9Ex2i"
},
"source": [
"## \"#hopelessness\""
]
},
{
"cell_type": "code",
"metadata": {
"id": "mqFLOv-AE5Lw"
},
"source": [
"## Create a function for tweets mining\n",
"def tweets_mining4(search_query4, num_tweets4, since_id_num4):\n",
" # Collect tweets using the Cursor object\n",
" # Each item in the iterator has various attributes that you can access to get information about each tweet\n",
" tweet_list4 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query4, lang=\"en\", since_id=since_id_num4, \n",
" tweet_mode='extended').items(num_tweets4)]\n",
" \n",
" # Begin scraping the tweets individually:\n",
" for tweet in tweet_list4[::-1]:\n",
" tweet_id = tweet.id # get Tweet ID result\n",
" created_at = tweet.created_at # get time tweet was created\n",
" text = tweet.full_text # retrieve full tweet text\n",
" location = tweet.user.location # retrieve user location\n",
" retweet = tweet.retweet_count # retrieve number of retweets\n",
" favorite = tweet.favorite_count # retrieve number of likes\n",
" with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv','a', newline='', encoding='utf-8') as csvFile4:\n",
" csv_writer4 = csv.writer(csvFile4, delimiter=',') # create an instance of csv object\n",
" csv_writer4.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row"
],
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "7Pf9avomE-G6"
},
"source": [
"search_words4 = \"#hopelessness\" # Specifying exact phrase to search\n",
"# Exclude Links, retweets, replies\n",
"search_query4 = search_words4 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
"with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv', encoding='utf-8') as data:\n",
" latest_tweet = int(list(csv.reader(data))[-1][0]) \n",
"tweets_mining4(search_query4, 10000, latest_tweet)"
],
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "pSauv_5jFAzX"
},
"source": [
"df_hopeless_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv\",\n",
" names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])"
],
"execution_count": 16,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "jFjXgpXDFwn1",
"outputId": "a063c672-3333-4e3b-c71e-c3329270854e"
},
"source": [
"df_hopeless_1"
],
"execution_count": 17,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1447537898572574730 | \n",
" 2021-10-11 12:21:43 | \n",
" Open discussion. Between the Transfer Portal a... | \n",
" Cheyenne Wyoming | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1447540582490988553 | \n",
" 2021-10-11 12:32:23 | \n",
" Plenty of things are changing in my life and t... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1447807717859491842 | \n",
" 2021-10-12 06:13:53 | \n",
" I feel a little hopeless. Anyone else? #hopele... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1448076026219692033 | \n",
" 2021-10-13 00:00:03 | \n",
" Which is more healthy? Hope, or hopelessness? ... | \n",
" Denver, CO | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1448382047375040513 | \n",
" 2021-10-13 20:16:04 | \n",
" So someone tell me how do I get over #HOPELESS... | \n",
" Portland Or . | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 5 | \n",
" 1448595145138622464 | \n",
" 2021-10-14 10:22:50 | \n",
" No parent deserves to experience the Indian le... | \n",
" Bombay, Dubai | \n",
" 1 | \n",
" 4 | \n",
"
\n",
" \n",
" | 6 | \n",
" 1448843909841313793 | \n",
" 2021-10-15 02:51:20 | \n",
" Being in a #union also looks a lot like being ... | \n",
" Alberta, Canada | \n",
" 7 | \n",
" 17 | \n",
"
\n",
" \n",
" | 7 | \n",
" 1449848070783524864 | \n",
" 2021-10-17 21:21:31 | \n",
" I am so glad that @GreysABC is tackling the hu... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 8 | \n",
" 1447537898572574730 | \n",
" 2021-10-11 12:21:43 | \n",
" Open discussion. Between the Transfer Portal a... | \n",
" Cheyenne Wyoming | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 9 | \n",
" 1447540582490988553 | \n",
" 2021-10-11 12:32:23 | \n",
" Plenty of things are changing in my life and t... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 10 | \n",
" 1447807717859491842 | \n",
" 2021-10-12 06:13:53 | \n",
" I feel a little hopeless. Anyone else? #hopele... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 11 | \n",
" 1448076026219692033 | \n",
" 2021-10-13 00:00:03 | \n",
" Which is more healthy? Hope, or hopelessness? ... | \n",
" Denver, CO | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 12 | \n",
" 1448382047375040513 | \n",
" 2021-10-13 20:16:04 | \n",
" So someone tell me how do I get over #HOPELESS... | \n",
" Portland Or . | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 13 | \n",
" 1448595145138622464 | \n",
" 2021-10-14 10:22:50 | \n",
" No parent deserves to experience the Indian le... | \n",
" Bombay, Dubai | \n",
" 1 | \n",
" 4 | \n",
"
\n",
" \n",
" | 14 | \n",
" 1448843909841313793 | \n",
" 2021-10-15 02:51:20 | \n",
" Being in a #union also looks a lot like being ... | \n",
" Alberta, Canada | \n",
" 7 | \n",
" 17 | \n",
"
\n",
" \n",
" | 15 | \n",
" 1449848070783524864 | \n",
" 2021-10-17 21:21:31 | \n",
" I am so glad that @GreysABC is tackling the hu... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 16 | \n",
" 1447537898572574730 | \n",
" 2021-10-11 12:21:43 | \n",
" Open discussion. Between the Transfer Portal a... | \n",
" Cheyenne Wyoming | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 17 | \n",
" 1447540582490988553 | \n",
" 2021-10-11 12:32:23 | \n",
" Plenty of things are changing in my life and t... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 18 | \n",
" 1447807717859491842 | \n",
" 2021-10-12 06:13:53 | \n",
" I feel a little hopeless. Anyone else? #hopele... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 19 | \n",
" 1448076026219692033 | \n",
" 2021-10-13 00:00:03 | \n",
" Which is more healthy? Hope, or hopelessness? ... | \n",
" Denver, CO | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 20 | \n",
" 1448382047375040513 | \n",
" 2021-10-13 20:16:04 | \n",
" So someone tell me how do I get over #HOPELESS... | \n",
" Portland Or . | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 21 | \n",
" 1448595145138622464 | \n",
" 2021-10-14 10:22:50 | \n",
" No parent deserves to experience the Indian le... | \n",
" Bombay, Dubai | \n",
" 1 | \n",
" 4 | \n",
"
\n",
" \n",
" | 22 | \n",
" 1448843909841313793 | \n",
" 2021-10-15 02:51:20 | \n",
" Being in a #union also looks a lot like being ... | \n",
" Alberta, Canada | \n",
" 7 | \n",
" 17 | \n",
"
\n",
" \n",
" | 23 | \n",
" 1449848070783524864 | \n",
" 2021-10-17 21:21:31 | \n",
" I am so glad that @GreysABC is tackling the hu... | \n",
" NaN | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | 24 | \n",
" 1451858330591318022 | \n",
" 2021-10-23 10:29:34 | \n",
" If you know someone who’s depressed please res... | \n",
" Rwanda | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 25 | \n",
" 1453499016394723330 | \n",
" 2021-10-27 23:09:04 | \n",
" A #grateful #heart will #SeeGod. You will find... | \n",
" Berlin, NJ | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 26 | \n",
" 1453738324598865920 | \n",
" 2021-10-28 15:00:00 | \n",
" “Our world today so desperately hungers for ho... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 27 | \n",
" 1453745900996726785 | \n",
" 2021-10-28 15:30:06 | \n",
" Depression is a bitch that is difficult for me... | \n",
" NaN | \n",
" 0 | \n",
" 3 | \n",
"
\n",
" \n",
" | 28 | \n",
" 1454441137951821824 | \n",
" 2021-10-30 13:32:44 | \n",
" Add to this list #whatsincreased \\n#petrol\\n#d... | \n",
" New Delhi, India | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 29 | \n",
" 1456980506160025604 | \n",
" 2021-11-06 13:43:16 | \n",
" \"Hopelessness has surprised me with patience.\"... | \n",
" Planet Earth | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 30 | \n",
" 1457005145510797315 | \n",
" 2021-11-06 15:21:11 | \n",
" “Go if you have to, but remember, don’t come b... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 31 | \n",
" 1457192619184902147 | \n",
" 2021-11-07 03:46:08 | \n",
" Hey @Headspace, I need to believe in something... | \n",
" Santo Mondongo | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 32 | \n",
" 1458953923151212548 | \n",
" 2021-11-12 00:24:55 | \n",
" 2 years ago I attempted #suicide to escape #do... | \n",
" Carpentersville, IL | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 33 | \n",
" 1459449269140787202 | \n",
" 2021-11-13 09:13:15 | \n",
" WARNING: Being deprived of God’s joy will lead... | \n",
" United States | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
"1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
"2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
"3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
"4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
"5 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n",
"6 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n",
"7 1449848070783524864 2021-10-17 21:21:31 ... 0 1\n",
"8 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
"9 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
"10 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
"11 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
"12 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
"13 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n",
"14 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n",
"15 1449848070783524864 2021-10-17 21:21:31 ... 0 1\n",
"16 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
"17 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
"18 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
"19 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
"20 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
"21 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n",
"22 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n",
"23 1449848070783524864 2021-10-17 21:21:31 ... 1 2\n",
"24 1451858330591318022 2021-10-23 10:29:34 ... 0 1\n",
"25 1453499016394723330 2021-10-27 23:09:04 ... 0 1\n",
"26 1453738324598865920 2021-10-28 15:00:00 ... 0 0\n",
"27 1453745900996726785 2021-10-28 15:30:06 ... 0 3\n",
"28 1454441137951821824 2021-10-30 13:32:44 ... 0 0\n",
"29 1456980506160025604 2021-11-06 13:43:16 ... 0 0\n",
"30 1457005145510797315 2021-11-06 15:21:11 ... 0 0\n",
"31 1457192619184902147 2021-11-07 03:46:08 ... 0 0\n",
"32 1458953923151212548 2021-11-12 00:24:55 ... 0 2\n",
"33 1459449269140787202 2021-11-13 09:13:15 ... 0 0\n",
"\n",
"[34 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 17
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zsX2-S8vGGh8"
},
"source": [
"## \"#mentalhealth\""
]
},
{
"cell_type": "code",
"metadata": {
"id": "gdvSCV-oGOP8"
},
"source": [
"## Create a function for tweets mining\n",
"def tweets_mining5(search_query5, num_tweets5, since_id_num5):\n",
"    \"\"\"Fetch up to num_tweets5 tweets matching search_query5 that are newer than\n",
"    since_id_num5 and append them (oldest first) to the #mentalhealth CSV.\"\"\"\n",
"    # Collect tweets using the Cursor object; each item exposes the tweet's attributes\n",
"    # NOTE(review): tweepy>=4 renamed api.search to api.search_tweets -- confirm the installed version\n",
"    tweet_list5 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query5, lang=\"en\", since_id=since_id_num5,\n",
"                                                      tweet_mode='extended').items(num_tweets5)]\n",
"\n",
"    # Open the CSV once for the whole batch (the original reopened it for every tweet)\n",
"    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv', 'a', newline='', encoding='utf-8') as csvFile5:\n",
"        csv_writer5 = csv.writer(csvFile5, delimiter=',')\n",
"        # Iterate in reverse so rows are appended oldest-first\n",
"        for tweet in tweet_list5[::-1]:\n",
"            csv_writer5.writerow([tweet.id, tweet.created_at, tweet.full_text,\n",
"                                  tweet.user.location, tweet.retweet_count, tweet.favorite_count])"
],
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Euoe88tsGdkc"
},
"source": [
"# Exact phrase to search for; the query suffix excludes links, retweets and replies\n",
"search_words5 = \"#mentalhealth\"\n",
"search_query5 = search_words5 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
"# Resume from the newest tweet id already stored in the CSV\n",
"with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv', encoding='utf-8') as data:\n",
"    last_row = list(csv.reader(data))[-1]\n",
"latest_tweet = int(last_row[0])\n",
"tweets_mining5(search_query5, 1000, latest_tweet)"
],
"execution_count": 19,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "s8rbK0pOGu80"
},
"source": [
"# Load the accumulated #mentalhealth tweets with explicit column names\n",
"df_mentalhealth_1 = pd.read_csv(\n",
"    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv\",\n",
"    names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"],\n",
")"
],
"execution_count": 20,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "CpmrexYEH9ii",
"outputId": "0b26846b-b32d-44ea-8612-4cfb551bb444"
},
"source": [
"df_mentalhealth_1"
],
"execution_count": 21,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1449685870945185792 | \n",
" 2021-10-17 10:37:00 | \n",
" Sunday's goals. \\n1. Take meds\\n2. Drink 3 lit... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1449686119658840065 | \n",
" 2021-10-17 10:37:59 | \n",
" \"????\" #Mentalhealth\\n\\ni'm tired of fighting... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1449686255185321986 | \n",
" 2021-10-17 10:38:31 | \n",
" Surrounded by people but feeling so alone 😔 \\n... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1449686716168671232 | \n",
" 2021-10-17 10:40:21 | \n",
" I understand my dv worker has emergencies but ... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1449687397776592898 | \n",
" 2021-10-17 10:43:04 | \n",
" Struggling to get out of bed and do things tha... | \n",
" England, United Kingdom | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 6592 | \n",
" 1459531596009283600 | \n",
" 2021-11-13 14:40:23 | \n",
" Let’s make good choices today friends!!! ❤️ #R... | \n",
" Florida, USA | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 6593 | \n",
" 1459532754387976200 | \n",
" 2021-11-13 14:45:00 | \n",
" Oh it’s a dark joke when I say I wanna bedazzl... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 6594 | \n",
" 1459532763942604800 | \n",
" 2021-11-13 14:45:02 | \n",
" I discovered today that clothes shopping is a ... | \n",
" England, United Kingdom | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 6595 | \n",
" 1459532906074935304 | \n",
" 2021-11-13 14:45:36 | \n",
" We composed a tweet thread about our college's... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 6596 | \n",
" 1459533316428754950 | \n",
" 2021-11-13 14:47:14 | \n",
" feels awkward at 1st but don’t know how i feel... | \n",
" Anaheim, CA | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
6597 rows × 6 columns
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1449685870945185792 2021-10-17 10:37:00 ... 0 1\n",
"1 1449686119658840065 2021-10-17 10:37:59 ... 0 0\n",
"2 1449686255185321986 2021-10-17 10:38:31 ... 0 1\n",
"3 1449686716168671232 2021-10-17 10:40:21 ... 0 0\n",
"4 1449687397776592898 2021-10-17 10:43:04 ... 0 0\n",
"... ... ... ... ... ...\n",
"6592 1459531596009283600 2021-11-13 14:40:23 ... 0 1\n",
"6593 1459532754387976200 2021-11-13 14:45:00 ... 0 1\n",
"6594 1459532763942604800 2021-11-13 14:45:02 ... 0 1\n",
"6595 1459532906074935304 2021-11-13 14:45:36 ... 0 1\n",
"6596 1459533316428754950 2021-11-13 14:47:14 ... 0 1\n",
"\n",
"[6597 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 21
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Jwcc9Bwdx0ie"
},
"source": [
"## \"#loneliness\""
]
},
{
"cell_type": "code",
"metadata": {
"id": "tfu8ca0Wx1m9"
},
"source": [
"## Create a function for tweets mining\n",
"def tweets_mining6(search_query6, num_tweets6, since_id_num6):\n",
"    \"\"\"Fetch up to num_tweets6 tweets matching search_query6 that are newer than\n",
"    since_id_num6 and append them (oldest first) to the #loneliness CSV.\"\"\"\n",
"    # Collect tweets using the Cursor object; each item exposes the tweet's attributes\n",
"    # NOTE(review): tweepy>=4 renamed api.search to api.search_tweets -- confirm the installed version\n",
"    tweet_list6 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query6, lang=\"en\", since_id=since_id_num6,\n",
"                                                      tweet_mode='extended').items(num_tweets6)]\n",
"\n",
"    # Open the CSV once for the whole batch (the original reopened it for every tweet)\n",
"    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv', 'a', newline='', encoding='utf-8') as csvFile6:\n",
"        csv_writer6 = csv.writer(csvFile6, delimiter=',')\n",
"        # Iterate in reverse so rows are appended oldest-first\n",
"        for tweet in tweet_list6[::-1]:\n",
"            csv_writer6.writerow([tweet.id, tweet.created_at, tweet.full_text,\n",
"                                  tweet.user.location, tweet.retweet_count, tweet.favorite_count])"
],
"execution_count": 22,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "veyW6kE7z5A0"
},
"source": [
"# Exact phrase to search for; the query suffix excludes links, retweets and replies\n",
"search_words6 = \"#loneliness\"\n",
"search_query6 = search_words6 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
"# Resume from the newest tweet id already stored in the CSV\n",
"with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv', encoding='utf-8') as data:\n",
"    last_row = list(csv.reader(data))[-1]\n",
"latest_tweet = int(last_row[0])\n",
"tweets_mining6(search_query6, 10000, latest_tweet)"
],
"execution_count": 23,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bggxtMrn0EGM"
},
"source": [
"# Load the accumulated #loneliness tweets with explicit column names\n",
"df_loneliness_1 = pd.read_csv(\n",
"    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv\",\n",
"    names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"],\n",
")"
],
"execution_count": 24,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "SlXTyO6d0KrH",
"outputId": "a8a7127b-34e5-437e-effd-a1364ff5bad5"
},
"source": [
"df_loneliness_1"
],
"execution_count": 25,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1447444376464998400 | \n",
" 2021-10-11 06:10:06 | \n",
" Every year passes but the pain remains the sam... | \n",
" India | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1447517473679441921 | \n",
" 2021-10-11 11:00:33 | \n",
" In this life, I can't expect things to be in m... | \n",
" Davao Region | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1447540227422162949 | \n",
" 2021-10-11 12:30:58 | \n",
" holidays can bring on a sense of loss - of fam... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1447564113928863744 | \n",
" 2021-10-11 14:05:53 | \n",
" Must be good to have someone by your side. #Lo... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1447599325304000515 | \n",
" 2021-10-11 16:25:48 | \n",
" #Artists without an air of #loneliness , are #... | \n",
" Sulaimanyah, Kurdistan | \n",
" 0 | \n",
" 5 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 306 | \n",
" 1459371193283362820 | \n",
" 2021-11-13 04:03:00 | \n",
" I want someone who loves to take nighttime dri... | \n",
" North Carolina, USA | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 307 | \n",
" 1459473286836989959 | \n",
" 2021-11-13 10:48:41 | \n",
" I have apparently reached the point of #autist... | \n",
" South West, England | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 308 | \n",
" 1459491234473553921 | \n",
" 2021-11-13 12:00:00 | \n",
" Give us a call. Need any advice with #covid19 ... | \n",
" Dublin City, Ireland | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 309 | \n",
" 1459495762908401664 | \n",
" 2021-11-13 12:18:00 | \n",
" fob lyrics trying so hard to be someone you’re... | \n",
" she/they • 18 • scorpio | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 310 | \n",
" 1459513880527441920 | \n",
" 2021-11-13 13:30:00 | \n",
" Give us a call. Need any advice with #covid19 ... | \n",
" Dublin City, Ireland | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
311 rows × 6 columns
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n",
"1 1447517473679441921 2021-10-11 11:00:33 ... 0 0\n",
"2 1447540227422162949 2021-10-11 12:30:58 ... 0 0\n",
"3 1447564113928863744 2021-10-11 14:05:53 ... 0 0\n",
"4 1447599325304000515 2021-10-11 16:25:48 ... 0 5\n",
".. ... ... ... ... ...\n",
"306 1459371193283362820 2021-11-13 04:03:00 ... 0 0\n",
"307 1459473286836989959 2021-11-13 10:48:41 ... 0 1\n",
"308 1459491234473553921 2021-11-13 12:00:00 ... 1 1\n",
"309 1459495762908401664 2021-11-13 12:18:00 ... 0 1\n",
"310 1459513880527441920 2021-11-13 13:30:00 ... 0 0\n",
"\n",
"[311 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 25
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "QnHoDxZ70SnD"
},
"source": [
"## \"#itsokaynottobeokay\""
]
},
{
"cell_type": "code",
"metadata": {
"id": "WtQHpt-c0Te1"
},
"source": [
"## Create a function for tweets mining\n",
"def tweets_mining7(search_query7, num_tweets7, since_id_num7):\n",
"    \"\"\"Fetch up to num_tweets7 tweets matching search_query7 that are newer than\n",
"    since_id_num7 and append them (oldest first) to the #itsokaynottobeokay CSV.\"\"\"\n",
"    # Collect tweets using the Cursor object; each item exposes the tweet's attributes\n",
"    # NOTE(review): tweepy>=4 renamed api.search to api.search_tweets -- confirm the installed version\n",
"    tweet_list7 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query7, lang=\"en\", since_id=since_id_num7,\n",
"                                                      tweet_mode='extended').items(num_tweets7)]\n",
"\n",
"    # Open the CSV once for the whole batch (the original reopened it for every tweet)\n",
"    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv', 'a', newline='', encoding='utf-8') as csvFile7:\n",
"        csv_writer7 = csv.writer(csvFile7, delimiter=',')\n",
"        # Iterate in reverse so rows are appended oldest-first\n",
"        for tweet in tweet_list7[::-1]:\n",
"            csv_writer7.writerow([tweet.id, tweet.created_at, tweet.full_text,\n",
"                                  tweet.user.location, tweet.retweet_count, tweet.favorite_count])"
],
"execution_count": 26,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "TP-dBQTL1vkD"
},
"source": [
"# Exact phrase to search for; the query suffix excludes links, retweets and replies\n",
"search_words7 = \"#itsokaynottobeokay\"\n",
"search_query7 = search_words7 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
"# Resume from the newest tweet id already stored in the CSV\n",
"with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv', encoding='utf-8') as data:\n",
"    last_row = list(csv.reader(data))[-1]\n",
"latest_tweet = int(last_row[0])\n",
"tweets_mining7(search_query7, 2000, latest_tweet)"
],
"execution_count": 27,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "IEyjMy_B2hc7"
},
"source": [
"# Load the accumulated #itsokaynottobeokay tweets with explicit column names\n",
"df_itsok_1 = pd.read_csv(\n",
"    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv\",\n",
"    names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"],\n",
")"
],
"execution_count": 28,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "GD5zNft02yGK",
"outputId": "22900167-41bb-4c8a-ca74-80db5d1a70e5"
},
"source": [
"df_itsok_1"
],
"execution_count": 29,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1447444376464998400 | \n",
" 2021-10-11 06:10:06 | \n",
" Every year passes but the pain remains the sam... | \n",
" India | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1447517473679441921 | \n",
" 2021-10-11 11:00:33 | \n",
" In this life, I can't expect things to be in m... | \n",
" Davao Region | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1447540227422162949 | \n",
" 2021-10-11 12:30:58 | \n",
" holidays can bring on a sense of loss - of fam... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1447564113928863744 | \n",
" 2021-10-11 14:05:53 | \n",
" Must be good to have someone by your side. #Lo... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1447599325304000515 | \n",
" 2021-10-11 16:25:48 | \n",
" #Artists without an air of #loneliness , are #... | \n",
" Sulaimanyah, Kurdistan | \n",
" 0 | \n",
" 5 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 160 | \n",
" 1459084076250546178 | \n",
" 2021-11-12 09:02:06 | \n",
" Every problem has a solution if you don’t know... | \n",
" South East, England | \n",
" 0 | \n",
" 10 | \n",
"
\n",
" \n",
" | 161 | \n",
" 1459236894219325441 | \n",
" 2021-11-12 19:09:21 | \n",
" I'm loving @calumscott new song, definitely me... | \n",
" Wrexham, Wales | \n",
" 0 | \n",
" 3 | \n",
"
\n",
" \n",
" | 162 | \n",
" 1459270946485719041 | \n",
" 2021-11-12 21:24:40 | \n",
" You ever stop to acknowledge : would you look... | \n",
" United States | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 163 | \n",
" 1459429100180111361 | \n",
" 2021-11-13 07:53:07 | \n",
" i became teume bcoz of “ #itsokaynottobeokay ”... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 164 | \n",
" 1459458776092786694 | \n",
" 2021-11-13 09:51:02 | \n",
" I don't usually do this but I just want to tha... | \n",
" Leicester, England | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
165 rows × 6 columns
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n",
"1 1447517473679441921 2021-10-11 11:00:33 ... 0 0\n",
"2 1447540227422162949 2021-10-11 12:30:58 ... 0 0\n",
"3 1447564113928863744 2021-10-11 14:05:53 ... 0 0\n",
"4 1447599325304000515 2021-10-11 16:25:48 ... 0 5\n",
".. ... ... ... ... ...\n",
"160 1459084076250546178 2021-11-12 09:02:06 ... 0 10\n",
"161 1459236894219325441 2021-11-12 19:09:21 ... 0 3\n",
"162 1459270946485719041 2021-11-12 21:24:40 ... 0 2\n",
"163 1459429100180111361 2021-11-13 07:53:07 ... 0 0\n",
"164 1459458776092786694 2021-11-13 09:51:02 ... 0 1\n",
"\n",
"[165 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 29
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-RXWp6HY44nN"
},
"source": [
"## \"#depression\""
]
},
{
"cell_type": "code",
"metadata": {
"id": "pbZltJ-k45d5"
},
"source": [
"## Create a function for tweets mining\n",
"def tweets_mining8(search_query8, num_tweets8, since_id_num8):\n",
"    \"\"\"Fetch up to num_tweets8 tweets matching search_query8 that are newer than\n",
"    since_id_num8 and append them (oldest first) to the #depression CSV.\"\"\"\n",
"    # Collect tweets using the Cursor object; each item exposes the tweet's attributes\n",
"    # NOTE(review): tweepy>=4 renamed api.search to api.search_tweets -- confirm the installed version\n",
"    tweet_list8 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query8, lang=\"en\", since_id=since_id_num8,\n",
"                                                      tweet_mode='extended').items(num_tweets8)]\n",
"\n",
"    # Open the CSV once for the whole batch (the original reopened it for every tweet)\n",
"    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv', 'a', newline='', encoding='utf-8') as csvFile8:\n",
"        csv_writer8 = csv.writer(csvFile8, delimiter=',')\n",
"        # Iterate in reverse so rows are appended oldest-first\n",
"        for tweet in tweet_list8[::-1]:\n",
"            csv_writer8.writerow([tweet.id, tweet.created_at, tweet.full_text,\n",
"                                  tweet.user.location, tweet.retweet_count, tweet.favorite_count])"
],
"execution_count": 30,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ghHhnfIO5xMg"
},
"source": [
"# Exact phrase to search for; the query suffix excludes links, retweets and replies\n",
"search_words8 = \"#depression\"\n",
"search_query8 = search_words8 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
"# Resume from the newest tweet id already stored in the CSV\n",
"with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv', encoding='utf-8') as data:\n",
"    last_row = list(csv.reader(data))[-1]\n",
"latest_tweet = int(last_row[0])\n",
"tweets_mining8(search_query8, 1000, latest_tweet)"
],
"execution_count": 31,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "2tZbCrCQ6BKL"
},
"source": [
"# Load the accumulated #depression tweets with explicit column names\n",
"df_depression_1 = pd.read_csv(\n",
"    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv\",\n",
"    names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"],\n",
")"
],
"execution_count": 32,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "9vYE-YWt6hsd",
"outputId": "172ccd9a-eb04-4617-eb09-b7bb421126c9"
},
"source": [
"df_depression_1"
],
"execution_count": 33,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1447381882828623879 | \n",
" 2021-10-11 02:01:46 | \n",
" #letstalk many suffering from #depression and ... | \n",
" Chicago, IL | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1447387707362131970 | \n",
" 2021-10-11 02:24:55 | \n",
" #Harassmentatwork can lead to debilitating men... | \n",
" Lahore | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1447396592877805570 | \n",
" 2021-10-11 03:00:13 | \n",
" So . . . my #therapist called my wife and told... | \n",
" If it makes a difference, ask. | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1447398472735342600 | \n",
" 2021-10-11 03:07:41 | \n",
" #psychology #love #mentalhealth #therapy #heal... | \n",
" NaN | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1447400177510146062 | \n",
" 2021-10-11 03:14:28 | \n",
" #psychology #love #mentalhealth #therapy #heal... | \n",
" NaN | \n",
" 1 | \n",
" 4 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 4478 | \n",
" 1459517445736124420 | \n",
" 2021-11-13 13:44:10 | \n",
" I've literally cried atleast once a day for th... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4479 | \n",
" 1459521433193877511 | \n",
" 2021-11-13 14:00:00 | \n",
" Black cohosh (Cimicifuga racemosa) is a partic... | \n",
" Global | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4480 | \n",
" 1459527712775847936 | \n",
" 2021-11-13 14:24:58 | \n",
" I mention therapy to him today, his response \"... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4481 | \n",
" 1459531002276192263 | \n",
" 2021-11-13 14:38:02 | \n",
" Finna go to dollar tree and get some organizin... | \n",
" Dallas Texas, USA | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4482 | \n",
" 1459532763942604800 | \n",
" 2021-11-13 14:45:02 | \n",
" I discovered today that clothes shopping is a ... | \n",
" England, United Kingdom | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
4483 rows × 6 columns
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1447381882828623879 2021-10-11 02:01:46 ... 0 0\n",
"1 1447387707362131970 2021-10-11 02:24:55 ... 1 1\n",
"2 1447396592877805570 2021-10-11 03:00:13 ... 0 0\n",
"3 1447398472735342600 2021-10-11 03:07:41 ... 1 0\n",
"4 1447400177510146062 2021-10-11 03:14:28 ... 1 4\n",
"... ... ... ... ... ...\n",
"4478 1459517445736124420 2021-11-13 13:44:10 ... 0 0\n",
"4479 1459521433193877511 2021-11-13 14:00:00 ... 1 1\n",
"4480 1459527712775847936 2021-11-13 14:24:58 ... 0 1\n",
"4481 1459531002276192263 2021-11-13 14:38:02 ... 0 0\n",
"4482 1459532763942604800 2021-11-13 14:45:02 ... 0 1\n",
"\n",
"[4483 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 33
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iaBSFYwsUPaI",
"outputId": "7b2a0935-671f-4d94-d364-5dc7a7134e12"
},
"source": [
"## Count the distinct values held by every column of the #depression frame\n",
"for col in df_depression_1.columns:\n",
"    n_unique = len(df_depression_1[col].unique())\n",
"    print(\"There are \", n_unique, \"unique values in \", col)"
],
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"There are 3185 unique values in tweet.id\n",
"There are 3182 unique values in created_at\n",
"There are 2818 unique values in text\n",
"There are 939 unique values in location\n",
"There are 23 unique values in retweet\n",
"There are 59 unique values in favorite\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "N2ZER9SmTPzF"
},
"source": [
"## \"#sad\""
]
},
{
"cell_type": "code",
"metadata": {
"id": "EWSDmH8s6iuZ"
},
"source": [
"## Create a function for tweets mining\n",
"def tweets_mining9(search_query9, num_tweets9, since_id_num9):\n",
"    \"\"\"Fetch up to num_tweets9 tweets matching search_query9 that are newer than\n",
"    since_id_num9 and append them (oldest first) to the #sad CSV.\"\"\"\n",
"    # Collect tweets using the Cursor object; each item exposes the tweet's attributes\n",
"    # NOTE(review): tweepy>=4 renamed api.search to api.search_tweets -- confirm the installed version\n",
"    tweet_list9 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query9, lang=\"en\", since_id=since_id_num9,\n",
"                                                      tweet_mode='extended').items(num_tweets9)]\n",
"\n",
"    # Open the CSV once for the whole batch (the original reopened it for every tweet)\n",
"    with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv', 'a', newline='', encoding='utf-8') as csvFile9:\n",
"        csv_writer9 = csv.writer(csvFile9, delimiter=',')\n",
"        # Iterate in reverse so rows are appended oldest-first\n",
"        for tweet in tweet_list9[::-1]:\n",
"            csv_writer9.writerow([tweet.id, tweet.created_at, tweet.full_text,\n",
"                                  tweet.user.location, tweet.retweet_count, tweet.favorite_count])"
],
"execution_count": 34,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "5G-4-YnoUAVZ"
},
"source": [
"# Exact phrase to search for; the query suffix excludes links, retweets and replies\n",
"search_words9 = \"#sad\"\n",
"search_query9 = search_words9 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n",
"# Resume from the newest tweet id already stored in the CSV\n",
"with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv', encoding='utf-8') as data:\n",
"    last_row = list(csv.reader(data))[-1]\n",
"latest_tweet = int(last_row[0])\n",
"tweets_mining9(search_query9, 2000, latest_tweet)"
],
"execution_count": 35,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "6ivTsYufUKw2"
},
"source": [
"# Load the accumulated #sad tweets with explicit column names\n",
"df_sad_1 = pd.read_csv(\n",
"    \"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv\",\n",
"    names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"],\n",
")"
],
"execution_count": 36,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "4TjbnQlJUUbA",
"outputId": "4ab3eb84-3d0c-444b-fa61-6d0b9969d3d2"
},
"source": [
"df_sad_1"
],
"execution_count": 37,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1447386915502792706 | \n",
" 2021-10-11 02:21:46 | \n",
" Tried to propose to Todd with an air ring duri... | \n",
" MD/DC | \n",
" 0 | \n",
" 4 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1447389433553096704 | \n",
" 2021-10-11 02:31:46 | \n",
" Forgetting to bring a post game pint to pickup... | \n",
" Canada | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1447390726132625416 | \n",
" 2021-10-11 02:36:54 | \n",
" bro wtf i came to school because of him and he... | \n",
" she / her | cbyf !! | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1447390741706149895 | \n",
" 2021-10-11 02:36:58 | \n",
" I agree with @clint_dempsey on the Yanks not w... | \n",
" Los Angeles, CA | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1447391562380554244 | \n",
" 2021-10-11 02:40:14 | \n",
" The amount of people who do not tip for grocer... | \n",
" NaN | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 3517 | \n",
" 1459521498842992642 | \n",
" 2021-11-13 14:00:16 | \n",
" Just got banned from a server F #sad | \n",
" Jakarta Capital Region | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3518 | \n",
" 1459521611997003777 | \n",
" 2021-11-13 14:00:43 | \n",
" I literally cried during my exam and the cam i... | \n",
" بيت أمك | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3519 | \n",
" 1459524263946326017 | \n",
" 2021-11-13 14:11:15 | \n",
" No one can be happy with a guy like me. That's... | \n",
" Varanasi, Uttar Pradesh, India | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3520 | \n",
" 1459530315437785095 | \n",
" 2021-11-13 14:35:18 | \n",
" arrived at my house but Am I Home? #deep #sad ... | \n",
" they19sea | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" | 3521 | \n",
" 1459530643591905284 | \n",
" 2021-11-13 14:36:36 | \n",
" Being spoken down to rn at @starbucks and reme... | \n",
" Night Vale, USA | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
3522 rows × 6 columns
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1447386915502792706 2021-10-11 02:21:46 ... 0 4\n",
"1 1447389433553096704 2021-10-11 02:31:46 ... 0 1\n",
"2 1447390726132625416 2021-10-11 02:36:54 ... 0 0\n",
"3 1447390741706149895 2021-10-11 02:36:58 ... 0 0\n",
"4 1447391562380554244 2021-10-11 02:40:14 ... 0 1\n",
"... ... ... ... ... ...\n",
"3517 1459521498842992642 2021-11-13 14:00:16 ... 0 1\n",
"3518 1459521611997003777 2021-11-13 14:00:43 ... 0 0\n",
"3519 1459524263946326017 2021-11-13 14:11:15 ... 0 0\n",
"3520 1459530315437785095 2021-11-13 14:35:18 ... 1 3\n",
"3521 1459530643591905284 2021-11-13 14:36:36 ... 0 0\n",
"\n",
"[3522 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 37
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "WMQTcPwD38hP"
},
"source": [
"# Combining all the tweets"
]
},
{
"cell_type": "code",
"metadata": {
"id": "aGjcg4Et6ZR9"
},
"source": [
"import glob"
],
"execution_count": 38,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 285
},
"id": "FVBUCENZ4BIQ",
"outputId": "e06fbce1-e125-4ff4-c763-b128e9acf2ea"
},
"source": [
"path = r'/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API' # use your path\n",
"all_files = glob.glob(path + \"/*.csv\")\n",
"\n",
"# Read every per-hashtag CSV into a dataframe with a shared schema\n",
"tweets = [\n",
"    pd.read_csv(filename,\n",
"                names=[\"tweet.id\", \"created_at\", \"text\", \"location\", \"retweet\", \"favorite\"])\n",
"    for filename in all_files\n",
"]\n",
"\n",
"# Stack them into a single frame (dead commented-out column assignment removed)\n",
"tweets_df = pd.concat(tweets, ignore_index=True)\n",
"tweets_df.head()"
],
"execution_count": 39,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1447537898572574730 | \n",
" 2021-10-11 12:21:43 | \n",
" Open discussion. Between the Transfer Portal a... | \n",
" Cheyenne Wyoming | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1447540582490988553 | \n",
" 2021-10-11 12:32:23 | \n",
" Plenty of things are changing in my life and t... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1447807717859491842 | \n",
" 2021-10-12 06:13:53 | \n",
" I feel a little hopeless. Anyone else? #hopele... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1448076026219692033 | \n",
" 2021-10-13 00:00:03 | \n",
" Which is more healthy? Hope, or hopelessness? ... | \n",
" Denver, CO | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1448382047375040513 | \n",
" 2021-10-13 20:16:04 | \n",
" So someone tell me how do I get over #HOPELESS... | \n",
" Portland Or . | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
"1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
"2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
"3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
"4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
"\n",
"[5 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 39
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 581
},
"id": "NIh6Pc_C5BmN",
"outputId": "6ceba47d-7e76-49e4-f459-8b78860e6aae"
},
"source": [
"tweets_df"
],
"execution_count": 40,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tweet.id | \n",
" created_at | \n",
" text | \n",
" location | \n",
" retweet | \n",
" favorite | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1447537898572574730 | \n",
" 2021-10-11 12:21:43 | \n",
" Open discussion. Between the Transfer Portal a... | \n",
" Cheyenne Wyoming | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1447540582490988553 | \n",
" 2021-10-11 12:32:23 | \n",
" Plenty of things are changing in my life and t... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1447807717859491842 | \n",
" 2021-10-12 06:13:53 | \n",
" I feel a little hopeless. Anyone else? #hopele... | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1448076026219692033 | \n",
" 2021-10-13 00:00:03 | \n",
" Which is more healthy? Hope, or hopelessness? ... | \n",
" Denver, CO | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1448382047375040513 | \n",
" 2021-10-13 20:16:04 | \n",
" So someone tell me how do I get over #HOPELESS... | \n",
" Portland Or . | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 24142 | \n",
" 1459521498842992642 | \n",
" 2021-11-13 14:00:16 | \n",
" Just got banned from a server F #sad | \n",
" Jakarta Capital Region | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 24143 | \n",
" 1459521611997003777 | \n",
" 2021-11-13 14:00:43 | \n",
" I literally cried during my exam and the cam i... | \n",
" بيت أمك | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 24144 | \n",
" 1459524263946326017 | \n",
" 2021-11-13 14:11:15 | \n",
" No one can be happy with a guy like me. That's... | \n",
" Varanasi, Uttar Pradesh, India | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 24145 | \n",
" 1459530315437785095 | \n",
" 2021-11-13 14:35:18 | \n",
" arrived at my house but Am I Home? #deep #sad ... | \n",
" they19sea | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" | 24146 | \n",
" 1459530643591905284 | \n",
" 2021-11-13 14:36:36 | \n",
" Being spoken down to rn at @starbucks and reme... | \n",
" Night Vale, USA | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
24147 rows × 6 columns
\n",
"
"
],
"text/plain": [
" tweet.id created_at ... retweet favorite\n",
"0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n",
"1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n",
"2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n",
"3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n",
"4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n",
"... ... ... ... ... ...\n",
"24142 1459521498842992642 2021-11-13 14:00:16 ... 0 1\n",
"24143 1459521611997003777 2021-11-13 14:00:43 ... 0 0\n",
"24144 1459524263946326017 2021-11-13 14:11:15 ... 0 0\n",
"24145 1459530315437785095 2021-11-13 14:35:18 ... 1 3\n",
"24146 1459530643591905284 2021-11-13 14:36:36 ... 0 0\n",
"\n",
"[24147 rows x 6 columns]"
]
},
"metadata": {},
"execution_count": 40
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Yia0nXGnQsiV"
},
"source": [
"# Persist the combined (not yet de-duplicated) tweets for the cleaning step below.\n# NOTE(review): hardcoded absolute Drive path — consider a DATA_DIR constant; TODO confirm the directory exists.\ntweets_df.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv')"
],
"execution_count": 41,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Zvj3hdFwO2IO"
},
"source": [
"## Data cleaning"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GEBn1OyhPDp1"
},
"source": [
"Data cleaning is one of the essential steps: without a proper cleaning procedure you will end up with errors in your analysis and, eventually, in your data-driven results. Here I eliminate duplicate tweets using the primary key ('tweet.id'), check for empty rows, and replace missing values with “NaN” where needed."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "zgrxs9HGOhnN",
"outputId": "f8886c9b-28b7-4429-ebe0-b91ad894f32b"
},
"source": [
"tweets_df.shape  # (rows, columns) of the mined tweets before de-duplication"
],
"execution_count": 42,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(24147, 6)"
]
},
"metadata": {},
"execution_count": 42
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 232
},
"id": "s6rb-N77QIA-",
"outputId": "ae758d07-1cbc-4bc8-988f-8f38777ac201"
},
"source": [
"## Inspect the dtype of every column, rendered as a one-column frame\n",
"tweets_df.dtypes.rename('data_type').to_frame()"
],
"execution_count": 43,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" data_type | \n",
"
\n",
" \n",
" \n",
" \n",
" | tweet.id | \n",
" int64 | \n",
"
\n",
" \n",
" | created_at | \n",
" object | \n",
"
\n",
" \n",
" | text | \n",
" object | \n",
"
\n",
" \n",
" | location | \n",
" object | \n",
"
\n",
" \n",
" | retweet | \n",
" int64 | \n",
"
\n",
" \n",
" | favorite | \n",
" int64 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" data_type\n",
"tweet.id int64\n",
"created_at object\n",
"text object\n",
"location object\n",
"retweet int64\n",
"favorite int64"
]
},
"metadata": {},
"execution_count": 43
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mYuqjbWiPJVK",
"outputId": "997390f8-38b1-41d6-a94d-d3b25ba402c4"
},
"source": [
"## Count distinct values per column (NaN counted as a value, matching len(unique()))\n",
"## to gauge how much duplication the de-duplication step must remove.\n",
"for col in tweets_df.columns:\n",
"    print(\"There are \", tweets_df[col].nunique(dropna=False), \"unique values in \", col)"
],
"execution_count": 45,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"There are 18190 unique values in tweet.id\n",
"There are 18071 unique values in created_at\n",
"There are 17107 unique values in text\n",
"There are 4648 unique values in location\n",
"There are 74 unique values in retweet\n",
"There are 159 unique values in favorite\n"
]
}
]
}
]
}