# function_for_input.py
import re
import string
import itertools
from collections import Counter

import numpy as np
import pandas as pd
import torch
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from unidecode import unidecode
from flair.data import Sentence
from flair.embeddings import SentenceTransformerDocumentEmbeddings
from torch.nn import CosineSimilarity

# Tokenizer models and stop-word lists used by pre_process() below.
nltk.download('stopwords')
nltk.download('punkt')

# Cosine similarity over embedding tensors, and a sentence-transformer
# document embedder used for the zero-shot post-type classification
# in input_variables().
cs = CosineSimilarity(dim=1)
emb = SentenceTransformerDocumentEmbeddings('stsb-roberta-base')
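
# Quick sketch of how the embedder and cosine similarity are combined below;
# the two sentences are invented examples, not project data.
if __name__ == "__main__":
    s1 = Sentence("we are hiring engineers")
    s2 = Sentence("join our team")
    emb.embed(s1)
    emb.embed(s2)
    print(cs(s1.embedding.reshape(1, -1), s2.embedding.reshape(1, -1)).item())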
def remove_URL(text):
    # Strip http(s):// and www. URLs.
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

def remove_html(text):
    # Strip HTML tags such as <b> ... </b>.
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

# Reference: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punct(text):
    # Delete every character in string.punctuation.
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)
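
# Hedged usage sketch for the cleaning helpers above; the sample post is
# made up for illustration.
if __name__ == "__main__":
    sample = "Great news! <b>We are hiring</b> 😀 apply at https://example.com #jobs"
    cleaned = remove_punct(remove_emoji(remove_html(remove_URL(sample))))
    print(cleaned)  # -> something like "Great news We are hiring apply at jobs"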
def pre_process(corpus):
    # convert the input corpus to lower case.
    corpus = corpus.lower()
    # collect the nltk English stop words and the punctuation characters
    # from the string module into a single list.
    stopset = stopwords.words('english') + list(string.punctuation)
    # strip emoji before tokenizing.
    corpus = remove_emoji(corpus)
    # word_tokenize splits the corpus into word tokens; drop stop words
    # and punctuation tokens.
    corpus = " ".join([i for i in word_tokenize(corpus) if i not in stopset])
    # transliterate non-ASCII characters to their closest ASCII equivalents.
    corpus = unidecode(corpus)
    return corpus
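
# A minimal sketch of what pre_process() does end to end; the input string
# is an invented example.
if __name__ == "__main__":
    print(pre_process("Thrilled to announce that we've crossed 10K users! 🎉"))
    # -> roughly "thrilled announce 've crossed 10k users"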
# Convert GloVe vectors to word2vec format so gensim can load them;
# glove.6B.50d.txt is the 50-dimensional file from the GloVe 6B download.
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B/glove.6B.50d.txt'
word2vec_output_file = 'word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

from gensim.models import KeyedVectors
# load the converted Stanford GloVe model as KeyedVectors
filename = 'word2vec.txt'
word_emb_model = KeyedVectors.load_word2vec_format(filename, binary=False)
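
# On gensim >= 4.0 the conversion step above is unnecessary (glove2word2vec
# is deprecated there): the GloVe file can be loaded directly. Alternative
# sketch, assuming gensim 4.x is installed:
#
#   word_emb_model = KeyedVectors.load_word2vec_format(
#       'glove.6B/glove.6B.50d.txt', binary=False, no_header=True)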
def map_word_frequency(document):
    # document is an iterable of token lists; flatten it and count words.
    return Counter(itertools.chain(*document))

def get_sif_feature_vectors(sentence1, sentence2, word_emb_model=word_emb_model):
    """Smooth-inverse-frequency (SIF) weighted average of word vectors."""
    # Keep only tokens in the embedding vocabulary
    # (`token in keyed_vectors` works on both gensim 3.x and 4.x).
    sentence1 = [token for token in sentence1.split() if token in word_emb_model]
    sentence2 = [token for token in sentence2.split() if token in word_emb_model]
    # NOTE: pass the two token lists, not their concatenation; chaining a
    # flat token list would count characters instead of words.
    word_counts = map_word_frequency([sentence1, sentence2])
    embedding_size = 50  # size of the vectors in the word embeddings
    a = 0.001
    sentence_set = []
    for sentence in [sentence1, sentence2]:
        vs = np.zeros(embedding_size)
        sentence_length = len(sentence)
        for word in sentence:
            a_value = a / (a + word_counts[word])  # smooth inverse frequency, SIF
            vs = np.add(vs, np.multiply(a_value, word_emb_model[word]))  # vs += sif * word_vector
        vs = np.divide(vs, sentence_length)  # weighted average (NaN if no tokens survive)
        sentence_set.append(vs)
    return sentence_set
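
# Worked example of the SIF weight used above: with a = 0.001, a word seen
# once across both sentences gets weight 0.001 / (0.001 + 1) ≈ 0.000999,
# while a word seen twice gets ≈ 0.0005, so more frequent words contribute
# less to the sentence vector (the scheme of Arora et al., 2017).
if __name__ == "__main__":
    a = 0.001
    for count in (1, 2, 5):
        print(count, a / (a + count))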
def get_feature_vectors(sentence1, sentence2, word_emb_model=word_emb_model):
    """Unweighted average of word vectors (every word gets weight 1)."""
    sentence1 = [token for token in sentence1.split() if token in word_emb_model]
    sentence2 = [token for token in sentence2.split() if token in word_emb_model]
    embedding_size = 50  # size of the vectors in the word embeddings
    sentence_set = []
    for sentence in [sentence1, sentence2]:
        vs = np.zeros(embedding_size)
        sentence_length = len(sentence)
        for word in sentence:
            vs = np.add(vs, word_emb_model[word])  # vs += word_vector (weight 1)
        vs = np.divide(vs, sentence_length)  # plain average (NaN if no tokens survive)
        sentence_set.append(vs)
    return sentence_set
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_similarity(feature_vec_1, feature_vec_2):
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]
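
# End-to-end sketch of the relevance-score machinery used below; both
# strings are invented examples, and the score depends on the GloVe
# vectors actually downloaded.
if __name__ == "__main__":
    s1 = pre_process("We build machine learning products")
    s2 = pre_process("Excited to ship our new ML model today")
    v1, v2 = get_feature_vectors(s1, s2)
    print(get_cosine_similarity(v1, v2))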
def input_variables(post_content, about, headline):
    text = post_content.lower()
    # count hashtags in the raw (lower-cased) post
    numb_hashtags = len([tag for tag in text.split() if tag.startswith("#")])

    # --- post-type classification ---
    # Zero-shot style: embed a few descriptor phrases per label and compare
    # them against the post embedding via cosine similarity.
    labels = {
        "achievement": ["achievement"],
        "info": ["knowledge and facts", "updates and announcements"],
        "insights": ["insightful experiences and life lessons"],
        "job opening": ["recruiting"],
        "call to action": ["share with us in comments what are your opinions preferences and suggestions"],
    }
    label_emb = {}
    for lab in labels:
        label_emb[lab] = []
        for labi in labels[lab]:
            sen = Sentence(labi)
            emb.embed(sen)
            label_emb[lab].append(sen.embedding.reshape(1, -1))
    sen = Sentence(post_content)
    emb.embed(sen)
    sen_emb = sen.embedding.reshape(1, -1)
    label_sims = {}
    for lab in label_emb:
        simi = 0
        for embd in label_emb[lab]:
            simi += cs(embd, sen_emb)
        simi = simi / len(label_emb[lab])  # average over the label's descriptors
        label_sims[lab] = simi
    max_lab = "other"
    max_sim = 0
    for lab in label_emb:
        if label_sims[lab] > max_sim:
            max_sim = label_sims[lab].item()
            max_lab = lab
    post_type = max_lab
    confidence = max_sim

    # --- text cleaning ---
    # str.replace treats its argument literally, so these substitutions
    # must go through re.sub for the patterns to apply.
    # remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # remove retweet/cc markers (word-bounded so words like "start" survive)
    text = re.sub(r'\b(rt|cc)\b', '', text)
    # remove the '#' of hashtags (keep the tag word)
    text = text.replace('#', '')
    # remove user mentions
    text = re.sub(r'@\S+', '', text)
    # remove emoji and other non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # remove punctuation (escaped so regex metacharacters are literal)
    text = re.sub('[{}]'.format(re.escape(string.punctuation)), '', text)
    # collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # --- relevance score ---
    # Compare the cleaned post against the author's "about" + "headline".
    info = about + " " + headline
    sentence_1 = pre_process(info)
    sentence_2 = pre_process(text)
    sentence_set = get_feature_vectors(sentence_1, sentence_2, word_emb_model=word_emb_model)
    # If either side has no in-vocabulary tokens its vector is NaN;
    # fall back to a relevance of 0.
    if np.isnan(sentence_set[0]).any() or np.isnan(sentence_set[1]).any():
        similarity = 0
    else:
        similarity = get_cosine_similarity(sentence_set[0], sentence_set[1])
    relevance_score = similarity
    content_len = len(text)
    return numb_hashtags, content_len, relevance_score, post_type, confidence
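
# Hedged end-to-end example of the entry point above; the post, "about",
# and headline strings are invented, and the exact numbers will depend on
# the downloaded GloVe and sentence-transformer weights.
if __name__ == "__main__":
    features = input_variables(
        post_content="We are hiring! Apply at https://example.com #jobs #hiring",
        about="Software company building NLP tools",
        headline="Data scientist",
    )
    numb_hashtags, content_len, relevance_score, post_type, confidence = features
    print(numb_hashtags, content_len, relevance_score, post_type, confidence)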