{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Twitter_API.ipynb", "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "n9mFOtjUGKmk" }, "source": [ "# Tweet mining using Twitter API via Tweepy:" ] }, { "cell_type": "markdown", "metadata": { "id": "-3bUQ54_84g8" }, "source": [ "In this notebook I am using the Tweepy python library to mine tweets using relevant hashtags. I was able to retrieve around 19000 unique tweets via twitter API. At the end, all the datasets with different depressive hashtags will be combined, cleaned and saved as depressive_tweets.csv." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1Bojm_bffNAV", "outputId": "92f04f31-eb1b-4c13-f811-1cad9d759a34" }, "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "7iWDBsjTwEyZ" }, "source": [ "## Tweets mining" ] }, { "cell_type": "code", "metadata": { "id": "TtZk0vyLwWwW" }, "source": [ "# Pin to the Tweepy 3.x line: this notebook uses api.search and\n", "# wait_on_rate_limit_notify, both of which were removed in Tweepy 4.0.\n", "!pip install -qqq tweepy==3.10.0" ], "execution_count": 2, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "jobjTBDIwhUl" }, "source": [ "## Import required libraries\n", "import tweepy\n", "from tweepy.streaming import StreamListener\n", "from tweepy import OAuthHandler\n", "from tweepy import Stream\n", "import csv\n", "import pandas as pd\n", "\n", "## Access to twitter API consumer_key and access_secret.\n", "## config is a local config.py module holding the credentials;\n", "## keep it out of version control so the keys are never committed.\n", "import config" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Dv5AsxY6iL2s" }, "source": [ "## Twitter API related information\n", "consumer_key = config.API_KEY\n", "consumer_secret = config.API_KEY_SECRET\n", "access_key= config.ACCESS_TOKEN\n", 
"access_secret = config.ACCESS_TOKEN_SECRET" ], "execution_count": 4, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "M6mSp-B_vzn-" }, "source": [ "auth = tweepy.OAuthHandler(consumer_key, consumer_secret) # Pass in Consumer key and secret for authentication by API\n", "auth.set_access_token(access_key, access_secret) # Pass in Access key and secret for authentication by API\n", "api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True) # Sleeps when API limit is reached" ], "execution_count": 5, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "FHqBQHYpDcz_" }, "source": [ "## depress_tags = [\"#depressed\", \"#anxiety\", \"#depression\", \"#suicide\", \"#mentalhealth\"\n", "## \"#loneliness\", \"#hopelessness\", \"#itsokaynottobeokay\", \"#sad\"]" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "0-BvNrToRims" }, "source": [ "## \"#depressed\"" ] }, { "cell_type": "code", "metadata": { "id": "BERTal4NwVNx" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining1(search_query1, num_tweets1, since_id_num1):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list1 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query1, lang=\"en\", since_id=since_id_num1, \n", " tweet_mode='extended').items(num_tweets1)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list1[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with 
open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv','a', newline='', encoding='utf-8') as csvFile1:\n", " csv_writer1 = csv.writer(csvFile1, delimiter=',') # create an instance of csv object\n", " csv_writer1.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 6, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "8LOXgG5xygnj" }, "source": [ "search_words1 = \"#depressed\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query1 = search_words1 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining1(search_query1, 1000, latest_tweet)" ], "execution_count": 7, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "JSDTPj7Nz5Rh" }, "source": [ "df_depressed_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "aQe7bso7VBZA", "outputId": "bed5b299-8399-4b86-f6d6-630085f308a8" }, "source": [ "df_depressed_1" ], "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014468823669458370572021-10-09 16:56:52I totally need someone to hug me TIGHT and say...NaN01
114468967998605393942021-10-09 17:54:13i plan on committing suicide today or tommorro...NaN01
214469122106729594912021-10-09 18:55:28Exhausted! Absolutely exhausted and my day isn...Lost 🤕08
314469319305372098562021-10-09 20:13:49Im going to get Far Cry 6 and playing video ga...NaN01
414469349144530821132021-10-09 20:25:41Just #depressed haven’t made money in 4 days o...Daddy’s lap.02
.....................
144014592926618488832032021-11-12 22:50:57it gets dark at 5 now. #depressedToronto, Ontario02
144114592954729931530302021-11-12 23:02:07Ignore my tweets, if I tweet, for the next cou...Paisley, Scotland01
144214593235108037591082021-11-13 00:53:32how tf you a psychology major and depressed? l...San Diego, CA00
144314593762075274403852021-11-13 04:22:56Liquors my bestie till my flight tomorrow fml ...Dreamville, LBC♥00
144414594972536980357142021-11-13 12:23:56i signed up for @netflix just so i can watch b...Washington, USA00
\n", "

1445 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1446882366945837057 2021-10-09 16:56:52 ... 0 1\n", "1 1446896799860539394 2021-10-09 17:54:13 ... 0 1\n", "2 1446912210672959491 2021-10-09 18:55:28 ... 0 8\n", "3 1446931930537209856 2021-10-09 20:13:49 ... 0 1\n", "4 1446934914453082113 2021-10-09 20:25:41 ... 0 2\n", "... ... ... ... ... ...\n", "1440 1459292661848883203 2021-11-12 22:50:57 ... 0 2\n", "1441 1459295472993153030 2021-11-12 23:02:07 ... 0 1\n", "1442 1459323510803759108 2021-11-13 00:53:32 ... 0 0\n", "1443 1459376207527440385 2021-11-13 04:22:56 ... 0 0\n", "1444 1459497253698035714 2021-11-13 12:23:56 ... 0 0\n", "\n", "[1445 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gnZnQBdQ8VZL", "outputId": "2dc93be1-17f9-4b9d-d1d5-cab5eafdb544" }, "source": [ "## Finding unique values in each column\n", "for col in df_depressed_1:\n", " print(\"There are \", len(df_depressed_1[col].unique()), \"unique values in \", col)" ], "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 849 unique values in tweet.id\n", "There are 849 unique values in created_at\n", "There are 843 unique values in text\n", "There are 383 unique values in location\n", "There are 7 unique values in retweet\n", "There are 25 unique values in favorite\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "jVSywSxSvYbS" }, "source": [ "### Anxiety and suicide " ] }, { "cell_type": "code", "metadata": { "id": "1UWM-o41vd6Z" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining2(search_query2, num_tweets2, since_id_num2):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list2 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query2, lang=\"en\", 
since_id=since_id_num2, \n", " tweet_mode='extended').items(num_tweets2)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list2[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv','a', newline='', encoding='utf-8') as csvFile2:\n", " csv_writer2 = csv.writer(csvFile2, delimiter=',') # create an instance of csv object\n", " csv_writer2.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 11, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "4WS3HYJ_yUPe" }, "source": [ "search_words2 = \"#anxiety\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query2 = search_words2 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining2(search_query2, 2000, latest_tweet)" ], "execution_count": 12, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "mMnPf-UoD1gA" }, "source": [ "df_anxiety_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 13, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "SyvsN8-3D73N", "outputId": "d139df05-638a-4a91-e94c-e7560db53069" }, "source": [ 
"df_anxiety_1" ], "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014470677496546140192021-10-10 05:13:31I can't wait to get the hell out. so I'll jus...NaN00
114470697143798579272021-10-10 05:21:19Morning. All people except me sleeping. @Billy...Queenie's Castle,Yate, S Glos01
214470722033889853462021-10-10 05:31:13On #WorldMentalHealthDay, a big shoutout to my...Bengaluru/Muscat/Palakad/Kochi09
314470723348257546262021-10-10 05:31:44I hate having anxiety about doing stuff that I...Utah, USA00
414470749865318481922021-10-10 05:42:16I am not scared of my ADHD, depression and anx...Wollongong, New South Wales211
.....................
686714592240317779394602021-11-12 18:18:14It’s amazing how everyone runs to me as the su...Pennsylvania, USA00
686814592248085127045162021-11-12 18:21:20Any suggestions on settling the stomach after ...Everywhere, Anywhere00
686914592280472787517472021-11-12 18:34:12Gotta love that superpowered #anxiety taking h...NaN00
687014592295181288939522021-11-12 18:40:02Growth nor healing is linear. Sometimes you ma...London00
687114592305273582223372021-11-12 18:44:03Just read on a YouTube comment how mentally il...NaN00
\n", "

6872 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447067749654614019 2021-10-10 05:13:31 ... 0 0\n", "1 1447069714379857927 2021-10-10 05:21:19 ... 0 1\n", "2 1447072203388985346 2021-10-10 05:31:13 ... 0 9\n", "3 1447072334825754626 2021-10-10 05:31:44 ... 0 0\n", "4 1447074986531848192 2021-10-10 05:42:16 ... 2 11\n", "... ... ... ... ... ...\n", "6867 1459224031777939460 2021-11-12 18:18:14 ... 0 0\n", "6868 1459224808512704516 2021-11-12 18:21:20 ... 0 0\n", "6869 1459228047278751747 2021-11-12 18:34:12 ... 0 0\n", "6870 1459229518128893952 2021-11-12 18:40:02 ... 0 0\n", "6871 1459230527358222337 2021-11-12 18:44:03 ... 0 0\n", "\n", "[6872 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ygvC0l-C9NXp", "outputId": "cef49691-326a-43d4-a7d5-28725aafc5b5" }, "source": [ "## Finding unique values in each column\n", "for col in df_anxiety_1:\n", " print(\"There are \", len(df_anxiety_1[col].unique()), \"unique values in \", col)" ], "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 4738 unique values in tweet.id\n", "There are 4733 unique values in created_at\n", "There are 4342 unique values in text\n", "There are 1381 unique values in location\n", "There are 33 unique values in retweet\n", "There are 80 unique values in favorite\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "iSbEvJo0CVBh" }, "source": [ "## \"#Suicide\"" ] }, { "cell_type": "code", "metadata": { "id": "ofqzhBcR1bj-" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining3(search_query3, num_tweets3, since_id_num3):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list3 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query3, lang=\"en\", 
since_id=since_id_num3, \n", " tweet_mode='extended').items(num_tweets3)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list3[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv','a', newline='', encoding='utf-8') as csvFile3:\n", " csv_writer3 = csv.writer(csvFile3, delimiter=',') # create an instance of csv object\n", " csv_writer3.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 10, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "_wIXzt57Cn3e" }, "source": [ "search_words3 = \"#suicide\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query3 = search_words3 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining3(search_query3, 10000, latest_tweet)" ], "execution_count": 11, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "XkfhTVodENiy" }, "source": [ "df_suicide_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 12, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "8HAqIISVEXFy", "outputId": "0667d586-2e25-4690-95b4-a86f748e9eae" }, "source": [ 
"df_suicide_1" ], "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014473814740349992962021-10-11 02:00:09#suicide is the strong belief that no matter h...NaN00
114474394294094151722021-10-11 05:50:26\"suicide\"\\nHollowness enough\\nSilence enough\\n...NaN22
214474443764649984002021-10-11 06:10:06Every year passes but the pain remains the sam...India00
314474454694671319062021-10-11 06:14:26Have I told you how much I hate my life😂😂😁 #su...Ohio, USA01
414474613062950133772021-10-11 07:17:22The man responsible for the #CDC policies that...United States12
.....................
71314594463045773639712021-11-13 09:01:28Someone wanted me to tell you. You're beautifu...D(1) Florida00
71414594540599753523202021-11-13 09:32:17It's a regular thing🙂💔\\n#Coimbatore #suicide #...Tiruppur, India03
71514594540736447651852021-11-13 09:32:21#Suicide is not as bad as people make it \\n\\nB...The Chisolm Trail00
71614594955483739340812021-11-13 12:17:09Just Uploaded My Review Of Dear Evan Hansen To...NaN00
71714595250841802670272021-11-13 14:14:31On #WorldKindnessDay we would just like to say...NaN15
\n", "

718 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447381474034999296 2021-10-11 02:00:09 ... 0 0\n", "1 1447439429409415172 2021-10-11 05:50:26 ... 2 2\n", "2 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n", "3 1447445469467131906 2021-10-11 06:14:26 ... 0 1\n", "4 1447461306295013377 2021-10-11 07:17:22 ... 1 2\n", ".. ... ... ... ... ...\n", "713 1459446304577363971 2021-11-13 09:01:28 ... 0 0\n", "714 1459454059975352320 2021-11-13 09:32:17 ... 0 3\n", "715 1459454073644765185 2021-11-13 09:32:21 ... 0 0\n", "716 1459495548373934081 2021-11-13 12:17:09 ... 0 0\n", "717 1459525084180267027 2021-11-13 14:14:31 ... 1 5\n", "\n", "[718 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "markdown", "metadata": { "id": "ZilsrGx9Ex2i" }, "source": [ "## \"#hopelessness\"" ] }, { "cell_type": "code", "metadata": { "id": "mqFLOv-AE5Lw" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining4(search_query4, num_tweets4, since_id_num4):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list4 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query4, lang=\"en\", since_id=since_id_num4, \n", " tweet_mode='extended').items(num_tweets4)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list4[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv','a', newline='', encoding='utf-8') as csvFile4:\n", " csv_writer4 = 
csv.writer(csvFile4, delimiter=',') # create an instance of csv object\n", " csv_writer4.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 14, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7Pf9avomE-G6" }, "source": [ "search_words4 = \"#hopelessness\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query4 = search_words4 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining4(search_query4, 10000, latest_tweet)" ], "execution_count": 15, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "pSauv_5jFAzX" }, "source": [ "df_hopeless_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 16, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "jFjXgpXDFwn1", "outputId": "a063c672-3333-4e3b-c71e-c3329270854e" }, "source": [ "df_hopeless_1" ], "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
114475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
214478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
314480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
414483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
514485951451386224642021-10-14 10:22:50No parent deserves to experience the Indian le...Bombay, Dubai14
614488439098413137932021-10-15 02:51:20Being in a #union also looks a lot like being ...Alberta, Canada717
714498480707835248642021-10-17 21:21:31I am so glad that @GreysABC is tackling the hu...NaN01
814475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
914475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
1014478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
1114480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
1214483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
1314485951451386224642021-10-14 10:22:50No parent deserves to experience the Indian le...Bombay, Dubai14
1414488439098413137932021-10-15 02:51:20Being in a #union also looks a lot like being ...Alberta, Canada717
1514498480707835248642021-10-17 21:21:31I am so glad that @GreysABC is tackling the hu...NaN01
1614475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
1714475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
1814478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
1914480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
2014483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
2114485951451386224642021-10-14 10:22:50No parent deserves to experience the Indian le...Bombay, Dubai14
2214488439098413137932021-10-15 02:51:20Being in a #union also looks a lot like being ...Alberta, Canada717
2314498480707835248642021-10-17 21:21:31I am so glad that @GreysABC is tackling the hu...NaN12
2414518583305913180222021-10-23 10:29:34If you know someone who’s depressed please res...Rwanda01
2514534990163947233302021-10-27 23:09:04A #grateful #heart will #SeeGod. You will find...Berlin, NJ01
2614537383245988659202021-10-28 15:00:00“Our world today so desperately hungers for ho...NaN00
2714537459009967267852021-10-28 15:30:06Depression is a bitch that is difficult for me...NaN03
2814544411379518218242021-10-30 13:32:44Add to this list #whatsincreased \\n#petrol\\n#d...New Delhi, India00
2914569805061600256042021-11-06 13:43:16\"Hopelessness has surprised me with patience.\"...Planet Earth00
3014570051455107973152021-11-06 15:21:11“Go if you have to, but remember, don’t come b...NaN00
3114571926191849021472021-11-07 03:46:08Hey @Headspace, I need to believe in something...Santo Mondongo00
3214589539231512125482021-11-12 00:24:552 years ago I attempted #suicide to escape #do...Carpentersville, IL02
3314594492691407872022021-11-13 09:13:15WARNING: Being deprived of God’s joy will lead...United States00
\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "5 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n", "6 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n", "7 1449848070783524864 2021-10-17 21:21:31 ... 0 1\n", "8 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "9 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "10 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "11 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "12 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "13 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n", "14 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n", "15 1449848070783524864 2021-10-17 21:21:31 ... 0 1\n", "16 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "17 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "18 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "19 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "20 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "21 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n", "22 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n", "23 1449848070783524864 2021-10-17 21:21:31 ... 1 2\n", "24 1451858330591318022 2021-10-23 10:29:34 ... 0 1\n", "25 1453499016394723330 2021-10-27 23:09:04 ... 0 1\n", "26 1453738324598865920 2021-10-28 15:00:00 ... 0 0\n", "27 1453745900996726785 2021-10-28 15:30:06 ... 0 3\n", "28 1454441137951821824 2021-10-30 13:32:44 ... 0 0\n", "29 1456980506160025604 2021-11-06 13:43:16 ... 0 0\n", "30 1457005145510797315 2021-11-06 15:21:11 ... 0 0\n", "31 1457192619184902147 2021-11-07 03:46:08 ... 0 0\n", "32 1458953923151212548 2021-11-12 00:24:55 ... 0 2\n", "33 1459449269140787202 2021-11-13 09:13:15 ... 
0 0\n", "\n", "[34 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "markdown", "metadata": { "id": "zsX2-S8vGGh8" }, "source": [ "## \"#mentalhealth\"" ] }, { "cell_type": "code", "metadata": { "id": "gdvSCV-oGOP8" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining5(search_query5, num_tweets5, since_id_num5):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list5 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query5, lang=\"en\", since_id=since_id_num5, \n", " tweet_mode='extended').items(num_tweets5)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list5[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv','a', newline='', encoding='utf-8') as csvFile5:\n", " csv_writer5 = csv.writer(csvFile5, delimiter=',') # create an instance of csv object\n", " csv_writer5.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 18, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Euoe88tsGdkc" }, "source": [ "search_words5 = \"#mentalhealth\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query5 = search_words5 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv', encoding='utf-8') as data:\n", " latest_tweet = 
int(list(csv.reader(data))[-1][0])\n", "tweets_mining5(search_query5, 1000, latest_tweet)" ], "execution_count": 19, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "s8rbK0pOGu80" }, "source": [ "df_mentalhealth_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 20, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "CpmrexYEH9ii", "outputId": "0b26846b-b32d-44ea-8612-4cfb551bb444" }, "source": [ "df_mentalhealth_1" ], "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014496858709451857922021-10-17 10:37:00Sunday's goals. \\n1. Take meds\\n2. Drink 3 lit...NaN01
114496861196588400652021-10-17 10:37:59\"????\" #Mentalhealth\\n\\ni'm tired of fighting...NaN00
214496862551853219862021-10-17 10:38:31Surrounded by people but feeling so alone 😔 \\n...NaN01
314496867161686712322021-10-17 10:40:21I understand my dv worker has emergencies but ...NaN00
414496873977765928982021-10-17 10:43:04Struggling to get out of bed and do things tha...England, United Kingdom00
.....................
659214595315960092836002021-11-13 14:40:23Let’s make good choices today friends!!! ❤️ #R...Florida, USA01
659314595327543879762002021-11-13 14:45:00Oh it’s a dark joke when I say I wanna bedazzl...NaN01
659414595327639426048002021-11-13 14:45:02I discovered today that clothes shopping is a ...England, United Kingdom01
659514595329060749353042021-11-13 14:45:36We composed a tweet thread about our college's...NaN01
659614595333164287549502021-11-13 14:47:14feels awkward at 1st but don’t know how i feel...Anaheim, CA01
\n", "

6597 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1449685870945185792 2021-10-17 10:37:00 ... 0 1\n", "1 1449686119658840065 2021-10-17 10:37:59 ... 0 0\n", "2 1449686255185321986 2021-10-17 10:38:31 ... 0 1\n", "3 1449686716168671232 2021-10-17 10:40:21 ... 0 0\n", "4 1449687397776592898 2021-10-17 10:43:04 ... 0 0\n", "... ... ... ... ... ...\n", "6592 1459531596009283600 2021-11-13 14:40:23 ... 0 1\n", "6593 1459532754387976200 2021-11-13 14:45:00 ... 0 1\n", "6594 1459532763942604800 2021-11-13 14:45:02 ... 0 1\n", "6595 1459532906074935304 2021-11-13 14:45:36 ... 0 1\n", "6596 1459533316428754950 2021-11-13 14:47:14 ... 0 1\n", "\n", "[6597 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "markdown", "metadata": { "id": "Jwcc9Bwdx0ie" }, "source": [ "## \"#loneliness\"" ] }, { "cell_type": "code", "metadata": { "id": "tfu8ca0Wx1m9" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining6(search_query6, num_tweets6, since_id_num6):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list6 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query6, lang=\"en\", since_id=since_id_num6, \n", " tweet_mode='extended').items(num_tweets6)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list6[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv','a', newline='', encoding='utf-8') as csvFile6:\n", " csv_writer6 = 
csv.writer(csvFile6, delimiter=',') # create an instance of csv object\n", " csv_writer6.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 22, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "veyW6kE7z5A0" }, "source": [ "search_words6 = \"#loneliness\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query6 = search_words6 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0])\n", "tweets_mining6(search_query6, 10000, latest_tweet)" ], "execution_count": 23, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bggxtMrn0EGM" }, "source": [ "df_loneliness_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 24, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "SlXTyO6d0KrH", "outputId": "a8a7127b-34e5-437e-effd-a1364ff5bad5" }, "source": [ "df_loneliness_1" ], "execution_count": 25, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014474443764649984002021-10-11 06:10:06Every year passes but the pain remains the sam...India00
114475174736794419212021-10-11 11:00:33In this life, I can't expect things to be in m...Davao Region00
214475402274221629492021-10-11 12:30:58holidays can bring on a sense of loss - of fam...NaN00
314475641139288637442021-10-11 14:05:53Must be good to have someone by your side. #Lo...NaN00
414475993253040005152021-10-11 16:25:48#Artists without an air of #loneliness , are #...Sulaimanyah, Kurdistan05
.....................
30614593711932833628202021-11-13 04:03:00I want someone who loves to take nighttime dri...North Carolina, USA00
30714594732868369899592021-11-13 10:48:41I have apparently reached the point of #autist...South West, England01
30814594912344735539212021-11-13 12:00:00Give us a call. Need any advice with #covid19 ...Dublin City, Ireland11
30914594957629084016642021-11-13 12:18:00fob lyrics trying so hard to be someone you’re...she/they • 18 • scorpio01
31014595138805274419202021-11-13 13:30:00Give us a call. Need any advice with #covid19 ...Dublin City, Ireland00
\n", "

311 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n", "1 1447517473679441921 2021-10-11 11:00:33 ... 0 0\n", "2 1447540227422162949 2021-10-11 12:30:58 ... 0 0\n", "3 1447564113928863744 2021-10-11 14:05:53 ... 0 0\n", "4 1447599325304000515 2021-10-11 16:25:48 ... 0 5\n", ".. ... ... ... ... ...\n", "306 1459371193283362820 2021-11-13 04:03:00 ... 0 0\n", "307 1459473286836989959 2021-11-13 10:48:41 ... 0 1\n", "308 1459491234473553921 2021-11-13 12:00:00 ... 1 1\n", "309 1459495762908401664 2021-11-13 12:18:00 ... 0 1\n", "310 1459513880527441920 2021-11-13 13:30:00 ... 0 0\n", "\n", "[311 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 25 } ] }, { "cell_type": "markdown", "metadata": { "id": "QnHoDxZ70SnD" }, "source": [ "## \"#itsokaynottobeokay\"" ] }, { "cell_type": "code", "metadata": { "id": "WtQHpt-c0Te1" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining7(search_query7, num_tweets7, since_id_num7):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list7 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query7, lang=\"en\", since_id=since_id_num7, \n", " tweet_mode='extended').items(num_tweets7)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list7[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv','a', newline='', encoding='utf-8') as csvFile7:\n", " csv_writer7 = 
csv.writer(csvFile7, delimiter=',') # create an instance of csv object\n", " csv_writer7.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 26, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "TP-dBQTL1vkD" }, "source": [ "search_words7 = \"#itsokaynottobeokay\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query7 = search_words7 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining7(search_query7, 2000, latest_tweet)" ], "execution_count": 27, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "IEyjMy_B2hc7" }, "source": [ "df_itsok_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 28, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "GD5zNft02yGK", "outputId": "22900167-41bb-4c8a-ca74-80db5d1a70e5" }, "source": [ "df_itsok_1" ], "execution_count": 29, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014474443764649984002021-10-11 06:10:06Every year passes but the pain remains the sam...India00
114475174736794419212021-10-11 11:00:33In this life, I can't expect things to be in m...Davao Region00
214475402274221629492021-10-11 12:30:58holidays can bring on a sense of loss - of fam...NaN00
314475641139288637442021-10-11 14:05:53Must be good to have someone by your side. #Lo...NaN00
414475993253040005152021-10-11 16:25:48#Artists without an air of #loneliness , are #...Sulaimanyah, Kurdistan05
.....................
16014590840762505461782021-11-12 09:02:06Every problem has a solution if you don’t know...South East, England010
16114592368942193254412021-11-12 19:09:21I'm loving @calumscott new song, definitely me...Wrexham, Wales03
16214592709464857190412021-11-12 21:24:40You ever stop to acknowledge : would you look...United States02
16314594291001801113612021-11-13 07:53:07i became teume bcoz of “ #itsokaynottobeokay ”...NaN00
16414594587760927866942021-11-13 09:51:02I don't usually do this but I just want to tha...Leicester, England01
\n", "

165 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n", "1 1447517473679441921 2021-10-11 11:00:33 ... 0 0\n", "2 1447540227422162949 2021-10-11 12:30:58 ... 0 0\n", "3 1447564113928863744 2021-10-11 14:05:53 ... 0 0\n", "4 1447599325304000515 2021-10-11 16:25:48 ... 0 5\n", ".. ... ... ... ... ...\n", "160 1459084076250546178 2021-11-12 09:02:06 ... 0 10\n", "161 1459236894219325441 2021-11-12 19:09:21 ... 0 3\n", "162 1459270946485719041 2021-11-12 21:24:40 ... 0 2\n", "163 1459429100180111361 2021-11-13 07:53:07 ... 0 0\n", "164 1459458776092786694 2021-11-13 09:51:02 ... 0 1\n", "\n", "[165 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 29 } ] }, { "cell_type": "markdown", "metadata": { "id": "-RXWp6HY44nN" }, "source": [ "## \"#depression\"" ] }, { "cell_type": "code", "metadata": { "id": "pbZltJ-k45d5" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining8(search_query8, num_tweets8, since_id_num8):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list8 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query8, lang=\"en\", since_id=since_id_num8, \n", " tweet_mode='extended').items(num_tweets8)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list8[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv','a', newline='', encoding='utf-8') as csvFile8:\n", " csv_writer8 = 
csv.writer(csvFile8, delimiter=',') # create an instance of csv object\n", " csv_writer8.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 30, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ghHhnfIO5xMg" }, "source": [ "search_words8 = \"#depression\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query8 = search_words8 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining8(search_query8, 1000, latest_tweet)" ], "execution_count": 31, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "2tZbCrCQ6BKL" }, "source": [ "df_depression_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 32, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "9vYE-YWt6hsd", "outputId": "172ccd9a-eb04-4617-eb09-b7bb421126c9" }, "source": [ "df_depression_1" ], "execution_count": 33, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014473818828286238792021-10-11 02:01:46#letstalk many suffering from #depression and ...Chicago, IL00
114473877073621319702021-10-11 02:24:55#Harassmentatwork can lead to debilitating men...Lahore11
214473965928778055702021-10-11 03:00:13So . . . my #therapist called my wife and told...If it makes a difference, ask.00
314473984727353426002021-10-11 03:07:41#psychology #love #mentalhealth #therapy #heal...NaN10
414474001775101460622021-10-11 03:14:28#psychology #love #mentalhealth #therapy #heal...NaN14
.....................
447814595174457361244202021-11-13 13:44:10I've literally cried atleast once a day for th...NaN00
447914595214331938775112021-11-13 14:00:00Black cohosh (Cimicifuga racemosa) is a partic...Global11
448014595277127758479362021-11-13 14:24:58I mention therapy to him today, his response \"...NaN01
448114595310022761922632021-11-13 14:38:02Finna go to dollar tree and get some organizin...Dallas Texas, USA00
448214595327639426048002021-11-13 14:45:02I discovered today that clothes shopping is a ...England, United Kingdom01
\n", "

4483 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447381882828623879 2021-10-11 02:01:46 ... 0 0\n", "1 1447387707362131970 2021-10-11 02:24:55 ... 1 1\n", "2 1447396592877805570 2021-10-11 03:00:13 ... 0 0\n", "3 1447398472735342600 2021-10-11 03:07:41 ... 1 0\n", "4 1447400177510146062 2021-10-11 03:14:28 ... 1 4\n", "... ... ... ... ... ...\n", "4478 1459517445736124420 2021-11-13 13:44:10 ... 0 0\n", "4479 1459521433193877511 2021-11-13 14:00:00 ... 1 1\n", "4480 1459527712775847936 2021-11-13 14:24:58 ... 0 1\n", "4481 1459531002276192263 2021-11-13 14:38:02 ... 0 0\n", "4482 1459532763942604800 2021-11-13 14:45:02 ... 0 1\n", "\n", "[4483 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 33 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iaBSFYwsUPaI", "outputId": "7b2a0935-671f-4d94-d364-5dc7a7134e12" }, "source": [ "## Finding unique values in each column\n", "for col in df_depression_1:\n", " print(\"There are \", len(df_depression_1[col].unique()), \"unique values in \", col)" ], "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 3185 unique values in tweet.id\n", "There are 3182 unique values in created_at\n", "There are 2818 unique values in text\n", "There are 939 unique values in location\n", "There are 23 unique values in retweet\n", "There are 59 unique values in favorite\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "N2ZER9SmTPzF" }, "source": [ "## \"#sad\"" ] }, { "cell_type": "code", "metadata": { "id": "EWSDmH8s6iuZ" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining9(search_query9, num_tweets9, since_id_num9):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list9 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query9, lang=\"en\", 
since_id=since_id_num9, \n", " tweet_mode='extended').items(num_tweets9)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list9[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv','a', newline='', encoding='utf-8') as csvFile9:\n", " csv_writer9 = csv.writer(csvFile9, delimiter=',') # create an instance of csv object\n", " csv_writer9.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 34, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "5G-4-YnoUAVZ" }, "source": [ "search_words9 = \"#sad\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query9 = search_words9 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining9(search_query9, 2000, latest_tweet)" ], "execution_count": 35, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "6ivTsYufUKw2" }, "source": [ "df_sad_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 36, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "4TjbnQlJUUbA", "outputId": "4ab3eb84-3d0c-444b-fa61-6d0b9969d3d2" }, "source": [ "df_sad_1" ], 
"execution_count": 37, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014473869155027927062021-10-11 02:21:46Tried to propose to Todd with an air ring duri...MD/DC04
114473894335530967042021-10-11 02:31:46Forgetting to bring a post game pint to pickup...Canada01
214473907261326254162021-10-11 02:36:54bro wtf i came to school because of him and he...she / her | cbyf !!00
314473907417061498952021-10-11 02:36:58I agree with @clint_dempsey on the Yanks not w...Los Angeles, CA00
414473915623805542442021-10-11 02:40:14The amount of people who do not tip for grocer...NaN01
.....................
351714595214988429926422021-11-13 14:00:16Just got banned from a server F #sadJakarta Capital Region01
351814595216119970037772021-11-13 14:00:43I literally cried during my exam and the cam i...بيت أمك00
351914595242639463260172021-11-13 14:11:15No one can be happy with a guy like me. That's...Varanasi, Uttar Pradesh, India00
352014595303154377850952021-11-13 14:35:18arrived at my house but Am I Home? #deep #sad ...they19sea13
352114595306435919052842021-11-13 14:36:36Being spoken down to rn at @starbucks and reme...Night Vale, USA00
\n", "

3522 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447386915502792706 2021-10-11 02:21:46 ... 0 4\n", "1 1447389433553096704 2021-10-11 02:31:46 ... 0 1\n", "2 1447390726132625416 2021-10-11 02:36:54 ... 0 0\n", "3 1447390741706149895 2021-10-11 02:36:58 ... 0 0\n", "4 1447391562380554244 2021-10-11 02:40:14 ... 0 1\n", "... ... ... ... ... ...\n", "3517 1459521498842992642 2021-11-13 14:00:16 ... 0 1\n", "3518 1459521611997003777 2021-11-13 14:00:43 ... 0 0\n", "3519 1459524263946326017 2021-11-13 14:11:15 ... 0 0\n", "3520 1459530315437785095 2021-11-13 14:35:18 ... 1 3\n", "3521 1459530643591905284 2021-11-13 14:36:36 ... 0 0\n", "\n", "[3522 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 37 } ] }, { "cell_type": "markdown", "metadata": { "id": "WMQTcPwD38hP" }, "source": [ "# Combining all the tweets" ] }, { "cell_type": "code", "metadata": { "id": "aGjcg4Et6ZR9" }, "source": [ "import glob" ], "execution_count": 38, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 285 }, "id": "FVBUCENZ4BIQ", "outputId": "e06fbce1-e125-4ff4-c763-b128e9acf2ea" }, "source": [ "path = r'/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API' # use your path\n", "all_files = glob.glob(path + \"/*.csv\")\n", "\n", "tweets = []\n", "\n", "for filename in all_files:\n", " df = pd.read_csv(filename, \n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"]) # Convert each csv to a dataframe\n", " tweets.append(df)\n", "\n", "tweets_df = pd.concat(tweets, ignore_index=True) # Merge all dataframes\n", "#tweets_df.columns=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"]\n", "tweets_df.head()" ], "execution_count": 39, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
114475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
214478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
314480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
414483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "\n", "[5 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 39 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "NIh6Pc_C5BmN", "outputId": "6ceba47d-7e76-49e4-f459-8b78860e6aae" }, "source": [ "tweets_df" ], "execution_count": 40, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
114475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
214478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
314480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
414483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
.....................
2414214595214988429926422021-11-13 14:00:16Just got banned from a server F #sadJakarta Capital Region01
2414314595216119970037772021-11-13 14:00:43I literally cried during my exam and the cam i...بيت أمك00
2414414595242639463260172021-11-13 14:11:15No one can be happy with a guy like me. That's...Varanasi, Uttar Pradesh, India00
2414514595303154377850952021-11-13 14:35:18arrived at my house but Am I Home? #deep #sad ...they19sea13
2414614595306435919052842021-11-13 14:36:36Being spoken down to rn at @starbucks and reme...Night Vale, USA00
\n", "

24147 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "... ... ... ... ... ...\n", "24142 1459521498842992642 2021-11-13 14:00:16 ... 0 1\n", "24143 1459521611997003777 2021-11-13 14:00:43 ... 0 0\n", "24144 1459524263946326017 2021-11-13 14:11:15 ... 0 0\n", "24145 1459530315437785095 2021-11-13 14:35:18 ... 1 3\n", "24146 1459530643591905284 2021-11-13 14:36:36 ... 0 0\n", "\n", "[24147 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 40 } ] }, { "cell_type": "code", "metadata": { "id": "Yia0nXGnQsiV" }, "source": [ "tweets_df.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv')" ], "execution_count": 41, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Zvj3hdFwO2IO" }, "source": [ "## Data cleaning" ] }, { "cell_type": "markdown", "metadata": { "id": "GEBn1OyhPDp1" }, "source": [ "Data cleaning is one of the essential steps because without a proper cleaning procedure you will have errors in your analysis and eventually your data-driven results. Here I try to eliminate duplicates tweets by using the Primary key ('tweets.id'), checked for empty rows and replaced “NaN” if there is any." 
] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zgrxs9HGOhnN", "outputId": "f8886c9b-28b7-4429-ebe0-b91ad894f32b" }, "source": [ "tweets_df.shape #Get number of rows and columns" ], "execution_count": 42, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(24147, 6)" ] }, "metadata": {}, "execution_count": 42 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 232 }, "id": "s6rb-N77QIA-", "outputId": "ae758d07-1cbc-4bc8-988f-8f38777ac201" }, "source": [ "## Check the data type of each column\n", "tweets_df.dtypes.to_frame().rename(columns={0:'data_type'})" ], "execution_count": 43, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_type
tweet.idint64
created_atobject
textobject
locationobject
retweetint64
favoriteint64
\n", "
" ], "text/plain": [ " data_type\n", "tweet.id int64\n", "created_at object\n", "text object\n", "location object\n", "retweet int64\n", "favorite int64" ] }, "metadata": {}, "execution_count": 43 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mYuqjbWiPJVK", "outputId": "997390f8-38b1-41d6-a94d-d3b25ba402c4" }, "source": [ "## Finding unique values in each column\n", "for col in tweets_df:\n", " print(\"There are \", len(tweets_df[col].unique()), \"unique values in \", col)" ], "execution_count": 45, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 18190 unique values in tweet.id\n", "There are 18071 unique values in created_at\n", "There are 17107 unique values in text\n", "There are 4648 unique values in location\n", "There are 74 unique values in retweet\n", "There are 159 unique values in favorite\n" ] } ] } ] }