-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
77 lines (56 loc) · 2.06 KB
/
preprocessing.py
File metadata and controls
77 lines (56 loc) · 2.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re
import string

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
def load_train_tweets(name_pos, name_neg):
    """Load the positive and negative tweets used for training.

    Parameters
    ----------
    name_pos : str
        Path to the file of positive tweets, one tweet per line.
    name_neg : str
        Path to the file of negative tweets, one tweet per line.

    Returns
    -------
    sentences : list of str
        All tweets (trailing newlines kept), positives first then negatives.
    size_pos : int
        Number of positive tweets (useful to build the prediction labels).
    size_neg : int
        Number of negative tweets.
    """
    # Accumulate all tweets in one list: positives first, then negatives.
    sentences = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(name_pos, encoding="utf-8") as pos_f:
        # Iterating a file yields its lines; extend appends them all at once.
        sentences.extend(pos_f)
    # Everything read so far is positive.
    size_pos = len(sentences)
    with open(name_neg, encoding="utf-8") as neg_f:
        sentences.extend(neg_f)
    # Negatives are whatever was appended after the positives.
    size_neg = len(sentences) - size_pos
    return sentences, size_pos, size_neg
def load_test_tweets(name):
    """Load the test tweets, separating each line's id from its text.

    Each line is expected to look like ``"<id>,<tweet text>"``; only the
    first comma is treated as the separator, so commas inside the tweet
    are preserved.

    Parameters
    ----------
    name : str
        Path to the test file, one "<id>,<tweet>" entry per line.

    Returns
    -------
    sentences : list of str
        Tweet texts (trailing newlines kept), in file order.
    ids : list of str
        The id prefix of each line, in the same order.
    """
    sentences = []
    ids = []
    with open(name, encoding="utf-8") as f:
        for line in f:
            # Split on the first "," only; `tweet_id` avoids shadowing
            # the builtin `id`.
            tweet_id, tweet = line.split(',', 1)
            sentences.append(tweet)
            ids.append(tweet_id)
    return sentences, ids
def predictions(size_pos, size_neg):
    """Build the label vector for the training tweets.

    Positives come first in the training list, so the labels are
    ``size_pos`` ones followed by ``size_neg`` minus-ones.

    Parameters
    ----------
    size_pos : int
        Number of positive tweets.
    size_neg : int
        Number of negative tweets.

    Returns
    -------
    numpy.ndarray
        Float array of length ``size_pos + size_neg`` holding +1.0 then -1.0.
    """
    # Preallocate once and fill by slice instead of concatenating two arrays.
    labels = np.empty(size_pos + size_neg)
    labels[:size_pos] = 1.0
    labels[size_pos:] = -1.0
    return labels
def tokenize(tweet, stem=False, remove_stop_words=False):
    """Tokenize a single tweet into lower-cased tokens.

    Punctuation is stripped, the text is tokenized with NLTK's
    ``TweetTokenizer`` (handles removed, repeated characters reduced),
    and optional stemming / stop-word filtering is applied.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    stem : bool, optional
        If True, apply Porter stemming to each token.
    remove_stop_words : bool, optional
        If True, drop English stop-words from the result.

    Returns
    -------
    list of str
        The processed tokens.
    """
    stop = stopwords.words('english')
    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # BUG FIX: str.replace matches a literal substring, so the original
    # `tweet.replace('[^\w\s]', '')` never changed anything. Use a real
    # regex substitution to drop characters that are neither word
    # characters nor whitespace (i.e. punctuation), as intended.
    tweet = re.sub(r'[^\w\s]', '', tweet)
    tokens = tokenizer.tokenize(tweet)
    # Option to perform stemming
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]
    # Option to remove stop-words
    if remove_stop_words:
        tokens = [token for token in tokens if token not in stop]
    return tokens