{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Twitter_API.ipynb", "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "n9mFOtjUGKmk" }, "source": [ "# Tweet mining using Twitter API via Tweepy:" ] }, { "cell_type": "markdown", "metadata": { "id": "-3bUQ54_84g8" }, "source": [ "In this notebook I am using the Tweepy python library to mine tweets using relevant hashtags. I was able to retrieve around 19000 unique tweets via twitter API. At the end, all the datasets with different depressive hashtags will be combined, cleaned and saved as depressive_tweets.csv." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1Bojm_bffNAV", "outputId": "92f04f31-eb1b-4c13-f811-1cad9d759a34" }, "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "7iWDBsjTwEyZ" }, "source": [ "## Tweets mining" ] }, { "cell_type": "code", "metadata": { "id": "TtZk0vyLwWwW" }, "source": [ "# Pin to the Tweepy 3.x line: this notebook uses api.search and\n", "# wait_on_rate_limit_notify, both of which were removed in Tweepy 4.0.\n", "!pip install -qqq tweepy==3.10.0" ], "execution_count": 2, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "jobjTBDIwhUl" }, "source": [ "## Import required libraries\n", "import tweepy\n", "from tweepy.streaming import StreamListener\n", "from tweepy import OAuthHandler\n", "from tweepy import Stream\n", "import csv\n", "import pandas as pd\n", "\n", "## Access to twitter API consumer_key and access_secret.\n", "## config is a local config.py module holding the credentials;\n", "## keep it out of version control so the keys are never committed.\n", "import config" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Dv5AsxY6iL2s" }, "source": [ "## Twitter API related information\n", "consumer_key = config.API_KEY\n", "consumer_secret = config.API_KEY_SECRET\n", "access_key= config.ACCESS_TOKEN\n", 
"access_secret = config.ACCESS_TOKEN_SECRET" ], "execution_count": 4, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "M6mSp-B_vzn-" }, "source": [ "auth = tweepy.OAuthHandler(consumer_key, consumer_secret) # Pass in Consumer key and secret for authentication by API\n", "auth.set_access_token(access_key, access_secret) # Pass in Access key and secret for authentication by API\n", "api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True) # Sleeps when API limit is reached" ], "execution_count": 5, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "FHqBQHYpDcz_" }, "source": [ "## depress_tags = [\"#depressed\", \"#anxiety\", \"#depression\", \"#suicide\", \"#mentalhealth\"\n", "## \"#loneliness\", \"#hopelessness\", \"#itsokaynottobeokay\", \"#sad\"]" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "0-BvNrToRims" }, "source": [ "## \"#depressed\"" ] }, { "cell_type": "code", "metadata": { "id": "BERTal4NwVNx" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining1(search_query1, num_tweets1, since_id_num1):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list1 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query1, lang=\"en\", since_id=since_id_num1, \n", " tweet_mode='extended').items(num_tweets1)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list1[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with 
open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv','a', newline='', encoding='utf-8') as csvFile1:\n", " csv_writer1 = csv.writer(csvFile1, delimiter=',') # create an instance of csv object\n", " csv_writer1.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 6, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "8LOXgG5xygnj" }, "source": [ "search_words1 = \"#depressed\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query1 = search_words1 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining1(search_query1, 1000, latest_tweet)" ], "execution_count": 7, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "JSDTPj7Nz5Rh" }, "source": [ "df_depressed_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "aQe7bso7VBZA", "outputId": "bed5b299-8399-4b86-f6d6-630085f308a8" }, "source": [ "df_depressed_1" ], "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014468823669458370572021-10-09 16:56:52I totally need someone to hug me TIGHT and say...NaN01
114468967998605393942021-10-09 17:54:13i plan on committing suicide today or tommorro...NaN01
214469122106729594912021-10-09 18:55:28Exhausted! Absolutely exhausted and my day isn...Lost 🤕08
314469319305372098562021-10-09 20:13:49Im going to get Far Cry 6 and playing video ga...NaN01
414469349144530821132021-10-09 20:25:41Just #depressed haven’t made money in 4 days o...Daddy’s lap.02
.....................
144014592926618488832032021-11-12 22:50:57it gets dark at 5 now. #depressedToronto, Ontario02
144114592954729931530302021-11-12 23:02:07Ignore my tweets, if I tweet, for the next cou...Paisley, Scotland01
144214593235108037591082021-11-13 00:53:32how tf you a psychology major and depressed? l...San Diego, CA00
144314593762075274403852021-11-13 04:22:56Liquors my bestie till my flight tomorrow fml ...Dreamville, LBC♥00
144414594972536980357142021-11-13 12:23:56i signed up for @netflix just so i can watch b...Washington, USA00
\n", "

1445 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1446882366945837057 2021-10-09 16:56:52 ... 0 1\n", "1 1446896799860539394 2021-10-09 17:54:13 ... 0 1\n", "2 1446912210672959491 2021-10-09 18:55:28 ... 0 8\n", "3 1446931930537209856 2021-10-09 20:13:49 ... 0 1\n", "4 1446934914453082113 2021-10-09 20:25:41 ... 0 2\n", "... ... ... ... ... ...\n", "1440 1459292661848883203 2021-11-12 22:50:57 ... 0 2\n", "1441 1459295472993153030 2021-11-12 23:02:07 ... 0 1\n", "1442 1459323510803759108 2021-11-13 00:53:32 ... 0 0\n", "1443 1459376207527440385 2021-11-13 04:22:56 ... 0 0\n", "1444 1459497253698035714 2021-11-13 12:23:56 ... 0 0\n", "\n", "[1445 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gnZnQBdQ8VZL", "outputId": "2dc93be1-17f9-4b9d-d1d5-cab5eafdb544" }, "source": [ "## Finding unique values in each column\n", "for col in df_depressed_1:\n", " print(\"There are \", len(df_depressed_1[col].unique()), \"unique values in \", col)" ], "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 849 unique values in tweet.id\n", "There are 849 unique values in created_at\n", "There are 843 unique values in text\n", "There are 383 unique values in location\n", "There are 7 unique values in retweet\n", "There are 25 unique values in favorite\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "jVSywSxSvYbS" }, "source": [ "### Anxiety and suicide " ] }, { "cell_type": "code", "metadata": { "id": "1UWM-o41vd6Z" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining2(search_query2, num_tweets2, since_id_num2):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list2 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query2, lang=\"en\", 
since_id=since_id_num2, \n", " tweet_mode='extended').items(num_tweets2)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list2[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv','a', newline='', encoding='utf-8') as csvFile2:\n", " csv_writer2 = csv.writer(csvFile2, delimiter=',') # create an instance of csv object\n", " csv_writer2.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 11, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "4WS3HYJ_yUPe" }, "source": [ "search_words2 = \"#anxiety\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query2 = search_words2 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining2(search_query2, 2000, latest_tweet)" ], "execution_count": 12, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "mMnPf-UoD1gA" }, "source": [ "df_anxiety_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 13, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "SyvsN8-3D73N", "outputId": "d139df05-638a-4a91-e94c-e7560db53069" }, "source": [ 
"df_anxiety_1" ], "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014470677496546140192021-10-10 05:13:31I can't wait to get the hell out. so I'll jus...NaN00
114470697143798579272021-10-10 05:21:19Morning. All people except me sleeping. @Billy...Queenie's Castle,Yate, S Glos01
214470722033889853462021-10-10 05:31:13On #WorldMentalHealthDay, a big shoutout to my...Bengaluru/Muscat/Palakad/Kochi09
314470723348257546262021-10-10 05:31:44I hate having anxiety about doing stuff that I...Utah, USA00
414470749865318481922021-10-10 05:42:16I am not scared of my ADHD, depression and anx...Wollongong, New South Wales211
.....................
686714592240317779394602021-11-12 18:18:14It’s amazing how everyone runs to me as the su...Pennsylvania, USA00
686814592248085127045162021-11-12 18:21:20Any suggestions on settling the stomach after ...Everywhere, Anywhere00
686914592280472787517472021-11-12 18:34:12Gotta love that superpowered #anxiety taking h...NaN00
687014592295181288939522021-11-12 18:40:02Growth nor healing is linear. Sometimes you ma...London00
687114592305273582223372021-11-12 18:44:03Just read on a YouTube comment how mentally il...NaN00
\n", "

6872 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447067749654614019 2021-10-10 05:13:31 ... 0 0\n", "1 1447069714379857927 2021-10-10 05:21:19 ... 0 1\n", "2 1447072203388985346 2021-10-10 05:31:13 ... 0 9\n", "3 1447072334825754626 2021-10-10 05:31:44 ... 0 0\n", "4 1447074986531848192 2021-10-10 05:42:16 ... 2 11\n", "... ... ... ... ... ...\n", "6867 1459224031777939460 2021-11-12 18:18:14 ... 0 0\n", "6868 1459224808512704516 2021-11-12 18:21:20 ... 0 0\n", "6869 1459228047278751747 2021-11-12 18:34:12 ... 0 0\n", "6870 1459229518128893952 2021-11-12 18:40:02 ... 0 0\n", "6871 1459230527358222337 2021-11-12 18:44:03 ... 0 0\n", "\n", "[6872 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ygvC0l-C9NXp", "outputId": "cef49691-326a-43d4-a7d5-28725aafc5b5" }, "source": [ "## Finding unique values in each column\n", "for col in df_anxiety_1:\n", " print(\"There are \", len(df_anxiety_1[col].unique()), \"unique values in \", col)" ], "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 4738 unique values in tweet.id\n", "There are 4733 unique values in created_at\n", "There are 4342 unique values in text\n", "There are 1381 unique values in location\n", "There are 33 unique values in retweet\n", "There are 80 unique values in favorite\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "iSbEvJo0CVBh" }, "source": [ "## \"#Suicide\"" ] }, { "cell_type": "code", "metadata": { "id": "ofqzhBcR1bj-" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining3(search_query3, num_tweets3, since_id_num3):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list3 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query3, lang=\"en\", 
since_id=since_id_num3, \n", " tweet_mode='extended').items(num_tweets3)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list3[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv','a', newline='', encoding='utf-8') as csvFile3:\n", " csv_writer3 = csv.writer(csvFile3, delimiter=',') # create an instance of csv object\n", " csv_writer3.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 10, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "_wIXzt57Cn3e" }, "source": [ "search_words3 = \"#suicide\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query3 = search_words3 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining3(search_query3, 10000, latest_tweet)" ], "execution_count": 11, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "XkfhTVodENiy" }, "source": [ "df_suicide_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 12, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "8HAqIISVEXFy", "outputId": "0667d586-2e25-4690-95b4-a86f748e9eae" }, "source": [ 
"df_suicide_1" ], "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014473814740349992962021-10-11 02:00:09#suicide is the strong belief that no matter h...NaN00
114474394294094151722021-10-11 05:50:26\"suicide\"\\nHollowness enough\\nSilence enough\\n...NaN22
214474443764649984002021-10-11 06:10:06Every year passes but the pain remains the sam...India00
314474454694671319062021-10-11 06:14:26Have I told you how much I hate my life😂😂😁 #su...Ohio, USA01
414474613062950133772021-10-11 07:17:22The man responsible for the #CDC policies that...United States12
.....................
71314594463045773639712021-11-13 09:01:28Someone wanted me to tell you. You're beautifu...D(1) Florida00
71414594540599753523202021-11-13 09:32:17It's a regular thing🙂💔\\n#Coimbatore #suicide #...Tiruppur, India03
71514594540736447651852021-11-13 09:32:21#Suicide is not as bad as people make it \\n\\nB...The Chisolm Trail00
71614594955483739340812021-11-13 12:17:09Just Uploaded My Review Of Dear Evan Hansen To...NaN00
71714595250841802670272021-11-13 14:14:31On #WorldKindnessDay we would just like to say...NaN15
\n", "

718 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447381474034999296 2021-10-11 02:00:09 ... 0 0\n", "1 1447439429409415172 2021-10-11 05:50:26 ... 2 2\n", "2 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n", "3 1447445469467131906 2021-10-11 06:14:26 ... 0 1\n", "4 1447461306295013377 2021-10-11 07:17:22 ... 1 2\n", ".. ... ... ... ... ...\n", "713 1459446304577363971 2021-11-13 09:01:28 ... 0 0\n", "714 1459454059975352320 2021-11-13 09:32:17 ... 0 3\n", "715 1459454073644765185 2021-11-13 09:32:21 ... 0 0\n", "716 1459495548373934081 2021-11-13 12:17:09 ... 0 0\n", "717 1459525084180267027 2021-11-13 14:14:31 ... 1 5\n", "\n", "[718 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "markdown", "metadata": { "id": "ZilsrGx9Ex2i" }, "source": [ "## \"#hopelessness\"" ] }, { "cell_type": "code", "metadata": { "id": "mqFLOv-AE5Lw" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining4(search_query4, num_tweets4, since_id_num4):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list4 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query4, lang=\"en\", since_id=since_id_num4, \n", " tweet_mode='extended').items(num_tweets4)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list4[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv','a', newline='', encoding='utf-8') as csvFile4:\n", " csv_writer4 = 
csv.writer(csvFile4, delimiter=',') # create an instance of csv object\n", " csv_writer4.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 14, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7Pf9avomE-G6" }, "source": [ "search_words4 = \"#hopelessness\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query4 = search_words4 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining4(search_query4, 10000, latest_tweet)" ], "execution_count": 15, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "pSauv_5jFAzX" }, "source": [ "df_hopeless_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 16, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "jFjXgpXDFwn1", "outputId": "a063c672-3333-4e3b-c71e-c3329270854e" }, "source": [ "df_hopeless_1" ], "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
114475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
214478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
314480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
414483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
514485951451386224642021-10-14 10:22:50No parent deserves to experience the Indian le...Bombay, Dubai14
614488439098413137932021-10-15 02:51:20Being in a #union also looks a lot like being ...Alberta, Canada717
714498480707835248642021-10-17 21:21:31I am so glad that @GreysABC is tackling the hu...NaN01
814475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
914475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
1014478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
1114480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
1214483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
1314485951451386224642021-10-14 10:22:50No parent deserves to experience the Indian le...Bombay, Dubai14
1414488439098413137932021-10-15 02:51:20Being in a #union also looks a lot like being ...Alberta, Canada717
1514498480707835248642021-10-17 21:21:31I am so glad that @GreysABC is tackling the hu...NaN01
1614475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
1714475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
1814478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
1914480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
2014483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
2114485951451386224642021-10-14 10:22:50No parent deserves to experience the Indian le...Bombay, Dubai14
2214488439098413137932021-10-15 02:51:20Being in a #union also looks a lot like being ...Alberta, Canada717
2314498480707835248642021-10-17 21:21:31I am so glad that @GreysABC is tackling the hu...NaN12
2414518583305913180222021-10-23 10:29:34If you know someone who’s depressed please res...Rwanda01
2514534990163947233302021-10-27 23:09:04A #grateful #heart will #SeeGod. You will find...Berlin, NJ01
2614537383245988659202021-10-28 15:00:00“Our world today so desperately hungers for ho...NaN00
2714537459009967267852021-10-28 15:30:06Depression is a bitch that is difficult for me...NaN03
2814544411379518218242021-10-30 13:32:44Add to this list #whatsincreased \\n#petrol\\n#d...New Delhi, India00
2914569805061600256042021-11-06 13:43:16\"Hopelessness has surprised me with patience.\"...Planet Earth00
3014570051455107973152021-11-06 15:21:11“Go if you have to, but remember, don’t come b...NaN00
3114571926191849021472021-11-07 03:46:08Hey @Headspace, I need to believe in something...Santo Mondongo00
3214589539231512125482021-11-12 00:24:552 years ago I attempted #suicide to escape #do...Carpentersville, IL02
3314594492691407872022021-11-13 09:13:15WARNING: Being deprived of God’s joy will lead...United States00
\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "5 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n", "6 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n", "7 1449848070783524864 2021-10-17 21:21:31 ... 0 1\n", "8 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "9 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "10 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "11 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "12 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "13 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n", "14 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n", "15 1449848070783524864 2021-10-17 21:21:31 ... 0 1\n", "16 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "17 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "18 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "19 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "20 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "21 1448595145138622464 2021-10-14 10:22:50 ... 1 4\n", "22 1448843909841313793 2021-10-15 02:51:20 ... 7 17\n", "23 1449848070783524864 2021-10-17 21:21:31 ... 1 2\n", "24 1451858330591318022 2021-10-23 10:29:34 ... 0 1\n", "25 1453499016394723330 2021-10-27 23:09:04 ... 0 1\n", "26 1453738324598865920 2021-10-28 15:00:00 ... 0 0\n", "27 1453745900996726785 2021-10-28 15:30:06 ... 0 3\n", "28 1454441137951821824 2021-10-30 13:32:44 ... 0 0\n", "29 1456980506160025604 2021-11-06 13:43:16 ... 0 0\n", "30 1457005145510797315 2021-11-06 15:21:11 ... 0 0\n", "31 1457192619184902147 2021-11-07 03:46:08 ... 0 0\n", "32 1458953923151212548 2021-11-12 00:24:55 ... 0 2\n", "33 1459449269140787202 2021-11-13 09:13:15 ... 
0 0\n", "\n", "[34 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "markdown", "metadata": { "id": "zsX2-S8vGGh8" }, "source": [ "## \"#mentalhealth\"" ] }, { "cell_type": "code", "metadata": { "id": "gdvSCV-oGOP8" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining5(search_query5, num_tweets5, since_id_num5):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list5 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query5, lang=\"en\", since_id=since_id_num5, \n", " tweet_mode='extended').items(num_tweets5)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list5[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv','a', newline='', encoding='utf-8') as csvFile5:\n", " csv_writer5 = csv.writer(csvFile5, delimiter=',') # create an instance of csv object\n", " csv_writer5.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 18, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Euoe88tsGdkc" }, "source": [ "search_words5 = \"#mentalhealth\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query5 = search_words5 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv', encoding='utf-8') as data:\n", " latest_tweet = 
int(list(csv.reader(data))[-1][0])\n", "tweets_mining5(search_query5, 1000, latest_tweet)" ], "execution_count": 19, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "s8rbK0pOGu80" }, "source": [ "df_mentalhealth_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 20, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "CpmrexYEH9ii", "outputId": "0b26846b-b32d-44ea-8612-4cfb551bb444" }, "source": [ "df_mentalhealth_1" ], "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014496858709451857922021-10-17 10:37:00Sunday's goals. \\n1. Take meds\\n2. Drink 3 lit...NaN01
114496861196588400652021-10-17 10:37:59\"????\" #Mentalhealth\\n\\ni'm tired of fighting...NaN00
214496862551853219862021-10-17 10:38:31Surrounded by people but feeling so alone 😔 \\n...NaN01
314496867161686712322021-10-17 10:40:21I understand my dv worker has emergencies but ...NaN00
414496873977765928982021-10-17 10:43:04Struggling to get out of bed and do things tha...England, United Kingdom00
.....................
659214595315960092836002021-11-13 14:40:23Let’s make good choices today friends!!! ❤️ #R...Florida, USA01
659314595327543879762002021-11-13 14:45:00Oh it’s a dark joke when I say I wanna bedazzl...NaN01
659414595327639426048002021-11-13 14:45:02I discovered today that clothes shopping is a ...England, United Kingdom01
659514595329060749353042021-11-13 14:45:36We composed a tweet thread about our college's...NaN01
659614595333164287549502021-11-13 14:47:14feels awkward at 1st but don’t know how i feel...Anaheim, CA01
\n", "

6597 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1449685870945185792 2021-10-17 10:37:00 ... 0 1\n", "1 1449686119658840065 2021-10-17 10:37:59 ... 0 0\n", "2 1449686255185321986 2021-10-17 10:38:31 ... 0 1\n", "3 1449686716168671232 2021-10-17 10:40:21 ... 0 0\n", "4 1449687397776592898 2021-10-17 10:43:04 ... 0 0\n", "... ... ... ... ... ...\n", "6592 1459531596009283600 2021-11-13 14:40:23 ... 0 1\n", "6593 1459532754387976200 2021-11-13 14:45:00 ... 0 1\n", "6594 1459532763942604800 2021-11-13 14:45:02 ... 0 1\n", "6595 1459532906074935304 2021-11-13 14:45:36 ... 0 1\n", "6596 1459533316428754950 2021-11-13 14:47:14 ... 0 1\n", "\n", "[6597 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "markdown", "metadata": { "id": "Jwcc9Bwdx0ie" }, "source": [ "## \"#loneliness\"" ] }, { "cell_type": "code", "metadata": { "id": "tfu8ca0Wx1m9" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining6(search_query6, num_tweets6, since_id_num6):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list6 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query6, lang=\"en\", since_id=since_id_num6, \n", " tweet_mode='extended').items(num_tweets6)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list6[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv','a', newline='', encoding='utf-8') as csvFile6:\n", " csv_writer6 = 
csv.writer(csvFile6, delimiter=',') # create an instance of csv object\n", " csv_writer6.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 22, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "veyW6kE7z5A0" }, "source": [ "search_words6 = \"#loneliness\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query6 = search_words6 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0])\n", "tweets_mining6(search_query6, 10000, latest_tweet)" ], "execution_count": 23, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bggxtMrn0EGM" }, "source": [ "df_loneliness_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 24, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "SlXTyO6d0KrH", "outputId": "a8a7127b-34e5-437e-effd-a1364ff5bad5" }, "source": [ "df_loneliness_1" ], "execution_count": 25, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014474443764649984002021-10-11 06:10:06Every year passes but the pain remains the sam...India00
114475174736794419212021-10-11 11:00:33In this life, I can't expect things to be in m...Davao Region00
214475402274221629492021-10-11 12:30:58holidays can bring on a sense of loss - of fam...NaN00
314475641139288637442021-10-11 14:05:53Must be good to have someone by your side. #Lo...NaN00
414475993253040005152021-10-11 16:25:48#Artists without an air of #loneliness , are #...Sulaimanyah, Kurdistan05
.....................
30614593711932833628202021-11-13 04:03:00I want someone who loves to take nighttime dri...North Carolina, USA00
30714594732868369899592021-11-13 10:48:41I have apparently reached the point of #autist...South West, England01
30814594912344735539212021-11-13 12:00:00Give us a call. Need any advice with #covid19 ...Dublin City, Ireland11
30914594957629084016642021-11-13 12:18:00fob lyrics trying so hard to be someone you’re...she/they • 18 • scorpio01
31014595138805274419202021-11-13 13:30:00Give us a call. Need any advice with #covid19 ...Dublin City, Ireland00
\n", "

311 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n", "1 1447517473679441921 2021-10-11 11:00:33 ... 0 0\n", "2 1447540227422162949 2021-10-11 12:30:58 ... 0 0\n", "3 1447564113928863744 2021-10-11 14:05:53 ... 0 0\n", "4 1447599325304000515 2021-10-11 16:25:48 ... 0 5\n", ".. ... ... ... ... ...\n", "306 1459371193283362820 2021-11-13 04:03:00 ... 0 0\n", "307 1459473286836989959 2021-11-13 10:48:41 ... 0 1\n", "308 1459491234473553921 2021-11-13 12:00:00 ... 1 1\n", "309 1459495762908401664 2021-11-13 12:18:00 ... 0 1\n", "310 1459513880527441920 2021-11-13 13:30:00 ... 0 0\n", "\n", "[311 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 25 } ] }, { "cell_type": "markdown", "metadata": { "id": "QnHoDxZ70SnD" }, "source": [ "## \"#itsokaynottobeokay\"" ] }, { "cell_type": "code", "metadata": { "id": "WtQHpt-c0Te1" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining7(search_query7, num_tweets7, since_id_num7):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list7 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query7, lang=\"en\", since_id=since_id_num7, \n", " tweet_mode='extended').items(num_tweets7)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list7[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv','a', newline='', encoding='utf-8') as csvFile7:\n", " csv_writer7 = 
csv.writer(csvFile7, delimiter=',') # create an instance of csv object\n", " csv_writer7.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 26, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "TP-dBQTL1vkD" }, "source": [ "search_words7 = \"#itsokaynottobeokay\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query7 = search_words7 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining7(search_query7, 2000, latest_tweet)" ], "execution_count": 27, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "IEyjMy_B2hc7" }, "source": [ "df_itsok_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 28, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "GD5zNft02yGK", "outputId": "22900167-41bb-4c8a-ca74-80db5d1a70e5" }, "source": [ "df_itsok_1" ], "execution_count": 29, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014474443764649984002021-10-11 06:10:06Every year passes but the pain remains the sam...India00
114475174736794419212021-10-11 11:00:33In this life, I can't expect things to be in m...Davao Region00
214475402274221629492021-10-11 12:30:58holidays can bring on a sense of loss - of fam...NaN00
314475641139288637442021-10-11 14:05:53Must be good to have someone by your side. #Lo...NaN00
414475993253040005152021-10-11 16:25:48#Artists without an air of #loneliness , are #...Sulaimanyah, Kurdistan05
.....................
16014590840762505461782021-11-12 09:02:06Every problem has a solution if you don’t know...South East, England010
16114592368942193254412021-11-12 19:09:21I'm loving @calumscott new song, definitely me...Wrexham, Wales03
16214592709464857190412021-11-12 21:24:40You ever stop to acknowledge : would you look...United States02
16314594291001801113612021-11-13 07:53:07i became teume bcoz of “ #itsokaynottobeokay ”...NaN00
16414594587760927866942021-11-13 09:51:02I don't usually do this but I just want to tha...Leicester, England01
\n", "

165 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447444376464998400 2021-10-11 06:10:06 ... 0 0\n", "1 1447517473679441921 2021-10-11 11:00:33 ... 0 0\n", "2 1447540227422162949 2021-10-11 12:30:58 ... 0 0\n", "3 1447564113928863744 2021-10-11 14:05:53 ... 0 0\n", "4 1447599325304000515 2021-10-11 16:25:48 ... 0 5\n", ".. ... ... ... ... ...\n", "160 1459084076250546178 2021-11-12 09:02:06 ... 0 10\n", "161 1459236894219325441 2021-11-12 19:09:21 ... 0 3\n", "162 1459270946485719041 2021-11-12 21:24:40 ... 0 2\n", "163 1459429100180111361 2021-11-13 07:53:07 ... 0 0\n", "164 1459458776092786694 2021-11-13 09:51:02 ... 0 1\n", "\n", "[165 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 29 } ] }, { "cell_type": "markdown", "metadata": { "id": "-RXWp6HY44nN" }, "source": [ "## \"#depression\"" ] }, { "cell_type": "code", "metadata": { "id": "pbZltJ-k45d5" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining8(search_query8, num_tweets8, since_id_num8):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list8 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query8, lang=\"en\", since_id=since_id_num8, \n", " tweet_mode='extended').items(num_tweets8)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list8[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv','a', newline='', encoding='utf-8') as csvFile8:\n", " csv_writer8 = 
csv.writer(csvFile8, delimiter=',') # create an instance of csv object\n", " csv_writer8.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 30, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ghHhnfIO5xMg" }, "source": [ "search_words8 = \"#depression\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query8 = search_words8 + \" -filter:links AND -filter:retweets AND -filter:replies\"\n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining8(search_query8, 1000, latest_tweet)" ], "execution_count": 31, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "2tZbCrCQ6BKL" }, "source": [ "df_depression_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 32, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "9vYE-YWt6hsd", "outputId": "172ccd9a-eb04-4617-eb09-b7bb421126c9" }, "source": [ "df_depression_1" ], "execution_count": 33, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014473818828286238792021-10-11 02:01:46#letstalk many suffering from #depression and ...Chicago, IL00
114473877073621319702021-10-11 02:24:55#Harassmentatwork can lead to debilitating men...Lahore11
214473965928778055702021-10-11 03:00:13So . . . my #therapist called my wife and told...If it makes a difference, ask.00
314473984727353426002021-10-11 03:07:41#psychology #love #mentalhealth #therapy #heal...NaN10
414474001775101460622021-10-11 03:14:28#psychology #love #mentalhealth #therapy #heal...NaN14
.....................
447814595174457361244202021-11-13 13:44:10I've literally cried atleast once a day for th...NaN00
447914595214331938775112021-11-13 14:00:00Black cohosh (Cimicifuga racemosa) is a partic...Global11
448014595277127758479362021-11-13 14:24:58I mention therapy to him today, his response \"...NaN01
448114595310022761922632021-11-13 14:38:02Finna go to dollar tree and get some organizin...Dallas Texas, USA00
448214595327639426048002021-11-13 14:45:02I discovered today that clothes shopping is a ...England, United Kingdom01
\n", "

4483 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447381882828623879 2021-10-11 02:01:46 ... 0 0\n", "1 1447387707362131970 2021-10-11 02:24:55 ... 1 1\n", "2 1447396592877805570 2021-10-11 03:00:13 ... 0 0\n", "3 1447398472735342600 2021-10-11 03:07:41 ... 1 0\n", "4 1447400177510146062 2021-10-11 03:14:28 ... 1 4\n", "... ... ... ... ... ...\n", "4478 1459517445736124420 2021-11-13 13:44:10 ... 0 0\n", "4479 1459521433193877511 2021-11-13 14:00:00 ... 1 1\n", "4480 1459527712775847936 2021-11-13 14:24:58 ... 0 1\n", "4481 1459531002276192263 2021-11-13 14:38:02 ... 0 0\n", "4482 1459532763942604800 2021-11-13 14:45:02 ... 0 1\n", "\n", "[4483 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 33 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iaBSFYwsUPaI", "outputId": "7b2a0935-671f-4d94-d364-5dc7a7134e12" }, "source": [ "## Finding unique values in each column\n", "for col in df_depression_1:\n", " print(\"There are \", len(df_depression_1[col].unique()), \"unique values in \", col)" ], "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 3185 unique values in tweet.id\n", "There are 3182 unique values in created_at\n", "There are 2818 unique values in text\n", "There are 939 unique values in location\n", "There are 23 unique values in retweet\n", "There are 59 unique values in favorite\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "N2ZER9SmTPzF" }, "source": [ "## \"#sad\"" ] }, { "cell_type": "code", "metadata": { "id": "EWSDmH8s6iuZ" }, "source": [ "## Create a function for tweets mining\n", "def tweets_mining9(search_query9, num_tweets9, since_id_num9):\n", " # Collect tweets using the Cursor object\n", " # Each item in the iterator has various attributes that you can access to get information about each tweet\n", " tweet_list9 = [tweets for tweets in tweepy.Cursor(api.search, q=search_query9, lang=\"en\", 
since_id=since_id_num9, \n", " tweet_mode='extended').items(num_tweets9)]\n", " \n", " # Begin scraping the tweets individually:\n", " for tweet in tweet_list9[::-1]:\n", " tweet_id = tweet.id # get Tweet ID result\n", " created_at = tweet.created_at # get time tweet was created\n", " text = tweet.full_text # retrieve full tweet text\n", " location = tweet.user.location # retrieve user location\n", " retweet = tweet.retweet_count # retrieve number of retweets\n", " favorite = tweet.favorite_count # retrieve number of likes\n", " with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv','a', newline='', encoding='utf-8') as csvFile9:\n", " csv_writer9 = csv.writer(csvFile9, delimiter=',') # create an instance of csv object\n", " csv_writer9.writerow([tweet_id, created_at, text, location, retweet, favorite]) # write each row" ], "execution_count": 34, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "5G-4-YnoUAVZ" }, "source": [ "search_words9 = \"#sad\" # Specifying exact phrase to search\n", "# Exclude Links, retweets, replies\n", "search_query9 = search_words9 + \" -filter:links AND -filter:retweets AND -filter:replies\" \n", "with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv', encoding='utf-8') as data:\n", " latest_tweet = int(list(csv.reader(data))[-1][0]) \n", "tweets_mining9(search_query9, 2000, latest_tweet)" ], "execution_count": 35, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "6ivTsYufUKw2" }, "source": [ "df_sad_1 = pd.read_csv(\"/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv\",\n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"])" ], "execution_count": 36, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "4TjbnQlJUUbA", "outputId": "4ab3eb84-3d0c-444b-fa61-6d0b9969d3d2" }, "source": [ "df_sad_1" ], 
"execution_count": 37, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014473869155027927062021-10-11 02:21:46Tried to propose to Todd with an air ring duri...MD/DC04
114473894335530967042021-10-11 02:31:46Forgetting to bring a post game pint to pickup...Canada01
214473907261326254162021-10-11 02:36:54bro wtf i came to school because of him and he...she / her | cbyf !!00
314473907417061498952021-10-11 02:36:58I agree with @clint_dempsey on the Yanks not w...Los Angeles, CA00
414473915623805542442021-10-11 02:40:14The amount of people who do not tip for grocer...NaN01
.....................
351714595214988429926422021-11-13 14:00:16Just got banned from a server F #sadJakarta Capital Region01
351814595216119970037772021-11-13 14:00:43I literally cried during my exam and the cam i...بيت أمك00
351914595242639463260172021-11-13 14:11:15No one can be happy with a guy like me. That's...Varanasi, Uttar Pradesh, India00
352014595303154377850952021-11-13 14:35:18arrived at my house but Am I Home? #deep #sad ...they19sea13
352114595306435919052842021-11-13 14:36:36Being spoken down to rn at @starbucks and reme...Night Vale, USA00
\n", "

3522 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447386915502792706 2021-10-11 02:21:46 ... 0 4\n", "1 1447389433553096704 2021-10-11 02:31:46 ... 0 1\n", "2 1447390726132625416 2021-10-11 02:36:54 ... 0 0\n", "3 1447390741706149895 2021-10-11 02:36:58 ... 0 0\n", "4 1447391562380554244 2021-10-11 02:40:14 ... 0 1\n", "... ... ... ... ... ...\n", "3517 1459521498842992642 2021-11-13 14:00:16 ... 0 1\n", "3518 1459521611997003777 2021-11-13 14:00:43 ... 0 0\n", "3519 1459524263946326017 2021-11-13 14:11:15 ... 0 0\n", "3520 1459530315437785095 2021-11-13 14:35:18 ... 1 3\n", "3521 1459530643591905284 2021-11-13 14:36:36 ... 0 0\n", "\n", "[3522 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 37 } ] }, { "cell_type": "markdown", "metadata": { "id": "WMQTcPwD38hP" }, "source": [ "# Combining all the tweets" ] }, { "cell_type": "code", "metadata": { "id": "aGjcg4Et6ZR9" }, "source": [ "import glob" ], "execution_count": 38, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 285 }, "id": "FVBUCENZ4BIQ", "outputId": "e06fbce1-e125-4ff4-c763-b128e9acf2ea" }, "source": [ "path = r'/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API' # use your path\n", "all_files = glob.glob(path + \"/*.csv\")\n", "\n", "tweets = []\n", "\n", "for filename in all_files:\n", " df = pd.read_csv(filename, \n", " names=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"]) # Convert each csv to a dataframe\n", " tweets.append(df)\n", "\n", "tweets_df = pd.concat(tweets, ignore_index=True) # Merge all dataframes\n", "#tweets_df.columns=['tweet.id', \"created_at\",\"text\", \"location\", \"retweet\", \"favorite\"]\n", "tweets_df.head()" ], "execution_count": 39, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
114475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
214478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
314480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
414483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "\n", "[5 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 39 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 581 }, "id": "NIh6Pc_C5BmN", "outputId": "6ceba47d-7e76-49e4-f459-8b78860e6aae" }, "source": [ "tweets_df" ], "execution_count": 40, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tweet.idcreated_attextlocationretweetfavorite
014475378985725747302021-10-11 12:21:43Open discussion. Between the Transfer Portal a...Cheyenne Wyoming00
114475405824909885532021-10-11 12:32:23Plenty of things are changing in my life and t...NaN00
214478077178594918422021-10-12 06:13:53I feel a little hopeless. Anyone else? #hopele...NaN00
314480760262196920332021-10-13 00:00:03Which is more healthy? Hope, or hopelessness? ...Denver, CO00
414483820473750405132021-10-13 20:16:04So someone tell me how do I get over #HOPELESS...Portland Or .02
.....................
2414214595214988429926422021-11-13 14:00:16Just got banned from a server F #sadJakarta Capital Region01
2414314595216119970037772021-11-13 14:00:43I literally cried during my exam and the cam i...بيت أمك00
2414414595242639463260172021-11-13 14:11:15No one can be happy with a guy like me. That's...Varanasi, Uttar Pradesh, India00
2414514595303154377850952021-11-13 14:35:18arrived at my house but Am I Home? #deep #sad ...they19sea13
2414614595306435919052842021-11-13 14:36:36Being spoken down to rn at @starbucks and reme...Night Vale, USA00
\n", "

24147 rows × 6 columns

\n", "
" ], "text/plain": [ " tweet.id created_at ... retweet favorite\n", "0 1447537898572574730 2021-10-11 12:21:43 ... 0 0\n", "1 1447540582490988553 2021-10-11 12:32:23 ... 0 0\n", "2 1447807717859491842 2021-10-12 06:13:53 ... 0 0\n", "3 1448076026219692033 2021-10-13 00:00:03 ... 0 0\n", "4 1448382047375040513 2021-10-13 20:16:04 ... 0 2\n", "... ... ... ... ... ...\n", "24142 1459521498842992642 2021-11-13 14:00:16 ... 0 1\n", "24143 1459521611997003777 2021-11-13 14:00:43 ... 0 0\n", "24144 1459524263946326017 2021-11-13 14:11:15 ... 0 0\n", "24145 1459530315437785095 2021-11-13 14:35:18 ... 1 3\n", "24146 1459530643591905284 2021-11-13 14:36:36 ... 0 0\n", "\n", "[24147 rows x 6 columns]" ] }, "metadata": {}, "execution_count": 40 } ] }, { "cell_type": "code", "metadata": { "id": "Yia0nXGnQsiV" }, "source": [ "tweets_df.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv')" ], "execution_count": 41, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Zvj3hdFwO2IO" }, "source": [ "## Data cleaning" ] }, { "cell_type": "markdown", "metadata": { "id": "GEBn1OyhPDp1" }, "source": [ "Data cleaning is one of the essential steps because without a proper cleaning procedure you will have errors in your analysis and eventually your data-driven results. Here I try to eliminate duplicates tweets by using the Primary key ('tweets.id'), checked for empty rows and replaced “NaN” if there is any." 
] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zgrxs9HGOhnN", "outputId": "f8886c9b-28b7-4429-ebe0-b91ad894f32b" }, "source": [ "tweets_df.shape #Get number of rows and columns" ], "execution_count": 42, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(24147, 6)" ] }, "metadata": {}, "execution_count": 42 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 232 }, "id": "s6rb-N77QIA-", "outputId": "ae758d07-1cbc-4bc8-988f-8f38777ac201" }, "source": [ "## Check the data type of each column\n", "tweets_df.dtypes.to_frame().rename(columns={0:'data_type'})" ], "execution_count": 43, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_type
tweet.idint64
created_atobject
textobject
locationobject
retweetint64
favoriteint64
\n", "
" ], "text/plain": [ " data_type\n", "tweet.id int64\n", "created_at object\n", "text object\n", "location object\n", "retweet int64\n", "favorite int64" ] }, "metadata": {}, "execution_count": 43 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mYuqjbWiPJVK", "outputId": "997390f8-38b1-41d6-a94d-d3b25ba402c4" }, "source": [ "## Finding unique values in each column\n", "for col in tweets_df:\n", " print(\"There are \", len(tweets_df[col].unique()), \"unique values in \", col)" ], "execution_count": 45, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 18190 unique values in tweet.id\n", "There are 18071 unique values in created_at\n", "There are 17107 unique values in text\n", "There are 4648 unique values in location\n", "There are 74 unique values in retweet\n", "There are 159 unique values in favorite\n" ] } ] } ] }