File size: 2,804 Bytes
c061ce5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# ==============================================================================
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
# AUTHORS: AMEY THAKUR & MEGA SATISH
# GITHUB (AMEY): https://github.com/Amey-Thakur
# GITHUB (MEGA): https://github.com/msatmod
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
# RELEASE DATE: June 5, 2022
# LICENSE: MIT License
# DESCRIPTION: Utility for cleaning raw tweet data for analysis.
# ==============================================================================

import argparse
import warnings
import clean_utilities as CU

# Silence every warning category for the whole process so that only this
# script's own status messages reach the terminal.
warnings.simplefilter("ignore")

def main():
    """
    Primary execution routine for the tweet cleaning utility.

    Reads the raw tweet text from the file named on the command line,
    passes it through ``CU.tweets_cleaner`` and writes the result to
    ``clean_tweet.txt`` in the current working directory.

    Returns:
        int: 0 when the cleaned tweet was written, 1 when reading,
        cleaning or writing failed. Failures are reported on standard
        output rather than raised. A missing or malformed command line is
        handled by argparse itself, which prints usage and raises
        SystemExit.
    """
    # Configuration of the command-line argument parser
    parser = argparse.ArgumentParser(
        description="Twitter Depression Detection: Text Cleaning Utility"
    )

    # 'filename' is a required positional argument, so argparse aborts
    # before this function continues if it is absent; no None check is needed.
    parser.add_argument(
        'filename',
        help="Path to the raw text file containing the tweet to be sanitized"
    )

    args = parser.parse_args()

    print(f"Targeting file for preprocessing: {args.filename}")

    # Stage 1: read the input. The try body is limited to the read so that the
    # "not discovered" message can only ever refer to the input path.
    try:
        with open(args.filename, 'r', encoding='utf-8') as file:
            raw_tweet = file.read()
    except FileNotFoundError:
        print(f"Error: The specified file '{args.filename}' was not discovered.")
        return 1
    except Exception as e:
        # Top-level boundary: report instead of crashing with a traceback.
        print(f"An unexpected analytical error occurred: {e}")
        return 1

    # Stage 2: clean and persist. The input handle is already closed here.
    try:
        print("Linguistic cleaning in progress...")
        # NOTE(review): the original author's comment describes the pipeline
        # as contraction expansion, tokenization and lemmatization; the
        # implementation lives in clean_utilities and is not visible here.
        sanitized_tweet = CU.tweets_cleaner(raw_tweet)

        with open('clean_tweet.txt', 'w', encoding='utf-8') as output_file:
            output_file.write(sanitized_tweet)
    except Exception as e:
        print(f"An unexpected analytical error occurred: {e}")
        return 1

    # Announce success only after the write has actually completed.
    print("Sanitization complete. Persistence target: clean_tweet.txt")
    return 0


if __name__ == '__main__':
    # Propagate main()'s status as the process exit code so that shells and
    # calling scripts can detect a failed run.
    raise SystemExit(main())