# File size: 4,562 Bytes
# 4d1cb0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# ==============================================================================
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
# AUTHORS: AMEY THAKUR & MEGA SATISH
# GITHUB (AMEY): https://github.com/Amey-Thakur
# GITHUB (MEGA): https://github.com/msatmod
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
# RELEASE DATE: June 5, 2022
# LICENSE: MIT License
# DESCRIPTION: Utility for predicting depression levels in tweets using SVM.
# ==============================================================================

import argparse
import pickle
import warnings
import numpy as np
import pandas as pd
import spacy
import en_core_web_lg
import clean_utilities as CU

# Install a blanket "ignore" filter so Python warnings do not mix with the
# messages this utility prints to the console
warnings.filterwarnings(action="ignore")

def _centroid_embeddings(nlp_engine, texts):
    """
    Build one centroid (mean token vector) per input text.

    Args:
        nlp_engine: Loaded spaCy pipeline; calling it on a string must yield
            tokens that expose a ``vector`` attribute.
        texts: Iterable of cleaned strings to embed.

    Returns:
        A 2-D float64 NumPy array with one row per text.

    Raises:
        ValueError: If a text produces no tokens. Averaging zero vectors
            would otherwise yield NaN features and an opaque downstream error.
    """
    centroids = []
    for text in texts:
        token_vectors = [token.vector for token in nlp_engine(text)]
        if not token_vectors:
            raise ValueError("the cleaned input contains no tokens to embed")
        # Upcast to float64 explicitly instead of multiplying by a hard-coded
        # np.ones((300)); the values are identical and the width is not assumed.
        centroids.append(np.array(token_vectors).mean(axis=0).astype(np.float64))
    return np.array(centroids)


def main():
    """
    Main entry point for the prediction utility.

    This script encapsulates the end-to-end inference pipeline:
        1. Argument Parsing: Captures input text file and model selection.
        2. Data Acquisition: Reads the tweet from the given UTF-8 text file.
        3. Model Loading: Unpickles the pre-trained SVM classifier. This is
           done before the expensive spaCy load so a missing artifact is
           reported immediately.
        4. Text Preprocessing: Normalization via clean_utilities.
        5. Feature Extraction: Generating centroid embeddings via spaCy.
        6. Classification: Binary prediction via the SVM, printed to stdout.

    Command-line arguments (read from ``sys.argv`` by argparse):
        filename: Path to the text file containing the tweet.
        model: Model name; only ``"SVM"`` is accepted.

    Returns:
        None. All outcomes, including errors, are reported on stdout.
    """
    # Initialize the CLI argument parser with a descriptive header
    parser = argparse.ArgumentParser(
        description="Twitter Depression Detection: Machine Learning Inference Utility"
    )

    # Positional argument for the target tweet content (text file)
    parser.add_argument(
        'filename',
        help="Path to the text file containing the tweet for classification"
    )

    # Positional argument for the classification model type
    parser.add_argument(
        'model',
        help="Target model architecture (currently optimized for 'SVM')"
    )

    args = parser.parse_args()

    # Both arguments are positional, so argparse has already guaranteed that a
    # filename is present; only the model choice still needs validating.
    if args.model != "SVM":
        print("Usage Error: Please provide an input file and specify 'SVM' as the target model.")
        return

    print(f"Loading input source: {args.filename}")

    try:
        # Step 1: Data Acquisition
        # The FileNotFoundError handler is scoped to this open() only, so the
        # "input file" message can never be triggered by another missing file.
        try:
            with open(args.filename, 'r', encoding='utf-8') as file:
                raw_test_tweet = file.read()
        except FileNotFoundError:
            print(f"Error: The input file {args.filename} could not be located.")
            return
        print(f"Captured Content: \"{raw_test_tweet}\"")

        # Step 2: Model Artifact Loading
        # The path is relative to the current working directory.
        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only load model artifacts that come from a trusted source.
        model_artifact_path = "../assets/models/model_svm1.pkl"
        try:
            with open(model_artifact_path, 'rb') as model_file:
                classifier = pickle.load(model_file)
        except FileNotFoundError:
            print(f"Error: The model artifact {model_artifact_path} could not be located.")
            return

        # Step 3: Linguistic Preprocessing
        # Normalizes the raw tweet via the project's cleaning helper
        print("Executing linguistic cleaning pipeline...")
        cleaned_input = [CU.tweets_cleaner(raw_test_tweet)]
        print(f"Normalized Form: {cleaned_input}")

        # Step 4: Feature Space Transformation
        # Utilizing dense word embeddings (spaCy 'en_core_web_lg' model)
        print("Transforming text to 300-dimensional semantic vectors...")
        nlp_engine = en_core_web_lg.load()
        semantic_features = _centroid_embeddings(nlp_engine, cleaned_input)

        # Step 5: Algorithmic Inference
        print("Performing binary classification...")
        prediction_bin = classifier.predict(semantic_features)

        # Step 6: Result Interpretation and User Communication
        # A predicted label of 1 is reported as depressive; anything else is not.
        is_depressive = prediction_bin[0]
        if is_depressive == 1:
            print("\n>>> CLASSIFICATION RESULT: The analyzed content exhibits depressive characteristics.")
        else:
            print("\n>>> CLASSIFICATION RESULT: The analyzed content is classified as non-depressive.")

    except Exception as e:
        # Top-level boundary for the CLI: report any other failure without a traceback
        print(f"An error occurred during the inference process: {e}")

# Run the inference pipeline only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()