-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
77 lines (56 loc) · 2.06 KB
/
preprocessing.py
File metadata and controls
77 lines (56 loc) · 2.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re
import string

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
def load_train_tweets(name_pos, name_neg):
    """Load the positive and negative tweets used for training.

    Parameters
    ----------
    name_pos : str
        Path to the file of positive tweets, one tweet per line.
    name_neg : str
        Path to the file of negative tweets, one tweet per line.

    Returns
    -------
    sentences : list of str
        All tweets (trailing newlines kept), positives first then negatives.
    size_pos : int
        Number of positive tweets (useful to build the prediction labels).
    size_neg : int
        Number of negative tweets.
    """
    # Accumulate all tweets in one list: positives first, then negatives.
    sentences = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(name_pos, encoding="utf-8") as pos_f:
        # Iterating a file yields its lines; extend appends them all at once.
        sentences.extend(pos_f)
    # Everything read so far is positive.
    size_pos = len(sentences)
    with open(name_neg, encoding="utf-8") as neg_f:
        sentences.extend(neg_f)
    # Negatives are whatever was appended after the positives.
    size_neg = len(sentences) - size_pos
    return sentences, size_pos, size_neg
def load_test_tweets(name):
    """Load the test tweets, separating each line's id from its text.

    Each line is expected to look like ``"<id>,<tweet text>"``; only the
    first comma is treated as the separator, so commas inside the tweet
    are preserved.

    Parameters
    ----------
    name : str
        Path to the test file, one "<id>,<tweet>" entry per line.

    Returns
    -------
    sentences : list of str
        Tweet texts (trailing newlines kept), in file order.
    ids : list of str
        The id prefix of each line, in the same order.
    """
    sentences = []
    ids = []
    with open(name, encoding="utf-8") as f:
        for line in f:
            # Split on the first "," only; `tweet_id` avoids shadowing
            # the builtin `id`.
            tweet_id, tweet = line.split(',', 1)
            sentences.append(tweet)
            ids.append(tweet_id)
    return sentences, ids
def predictions(size_pos, size_neg):
    """Build the label vector for the training tweets.

    Positives come first in the training list, so the labels are
    ``size_pos`` ones followed by ``size_neg`` minus-ones.

    Parameters
    ----------
    size_pos : int
        Number of positive tweets.
    size_neg : int
        Number of negative tweets.

    Returns
    -------
    numpy.ndarray
        Float array of length ``size_pos + size_neg`` holding +1.0 then -1.0.
    """
    # Preallocate once and fill by slice instead of concatenating two arrays.
    labels = np.empty(size_pos + size_neg)
    labels[:size_pos] = 1.0
    labels[size_pos:] = -1.0
    return labels
def tokenize(tweet, stem=False, remove_stop_words=False):
    """Tokenize a single tweet into lower-cased tokens.

    Punctuation is stripped, the text is tokenized with NLTK's
    ``TweetTokenizer`` (handles removed, repeated characters reduced),
    and optional stemming / stop-word filtering is applied.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    stem : bool, optional
        If True, apply Porter stemming to each token.
    remove_stop_words : bool, optional
        If True, drop English stop-words from the result.

    Returns
    -------
    list of str
        The processed tokens.
    """
    stop = stopwords.words('english')
    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # BUG FIX: str.replace matches a literal substring, so the original
    # `tweet.replace('[^\w\s]', '')` never changed anything. Use a real
    # regex substitution to drop characters that are neither word
    # characters nor whitespace (i.e. punctuation), as intended.
    tweet = re.sub(r'[^\w\s]', '', tweet)
    tokens = tokenizer.tokenize(tweet)
    # Option to perform stemming
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]
    # Option to remove stop-words
    if remove_stop_words:
        tokens = [token for token in tokens if token not in stop]
    return tokens