cd package_name
python setup.py install
python
import pandas as pd
# Read the two Excel files and merge them into a single CSV (parse_cols: columns to read, left unspecified here)
df1 = pd.read_excel(file1, index_col=None, na_values=['NA'], parse_cols=[])
df2 = pd.read_excel(file2, index_col=None, na_values=['NA'], parse_cols=[])
frames = [df1, df2]
df = pd.concat(frames, ignore_index=True)
df.to_csv(mainfile, encoding='utf-8', index=False)
# Dictionary of slang / chat abbreviations mapped to their standard forms
stan_dic = {'rt': 'Retweet', 'dm': 'direct message', 'awsm': 'awesome', 'luv': 'love'}  # "..." extend with more terms as needed

def lookup_words(mail):
    # Replace each known slang word with its standard form
    words = mail.split()
    word_n = []
    for word in words:
        if word.lower() in stan_dic:
            word = stan_dic[word.lower()]
        word_n.append(word)
    return " ".join(word_n)
lookup_words("RT this is a retweeted tweet by Shivam Bansal")
>> "Retweet this is a retweeted tweet by Shivam Bansal"
from nltk import sent_tokenize, wordpunct_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# stoplist is not defined elsewhere in the snippet; NLTK's English stop words are assumed here
stoplist = set(stopwords.words('english'))

def tokenize(document):
    # Sentence-split, POS-tag and lemmatize the document, skipping stop words
    if isinstance(document, bytes):
        document = document.decode('utf-8')
    lemmy = []
    for sent in sent_tokenize(document):
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            if token in stoplist:
                continue
            lemma = lemmatize(token, tag)
            lemmy.append(lemma)
    return lemmy

def lemmatize(token, tag):
    # Map the Penn Treebank tag prefix to a WordNet POS constant (default: noun)
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(tag[0], wn.NOUN)
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(token, tag)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.5, max_features=15000, min_df=2, stop_words='english', use_idf=True, ngram_range=(1, 3))
Latent Semantic Analysis (LSA) is a technique in natural language processing, in particular distributional semantics, for analyzing relationships between a set of documents and the terms they contain by producing a set of concepts related to the documents and terms.
LSA assumes that words that are close in meaning will occur in similar pieces of text (the distributional hypothesis).
A matrix containing word counts per paragraph (rows represent unique words and columns represent each paragraph) is constructed from a large piece of text, and a mathematical technique called singular value decomposition (SVD) is used to reduce the number of rows while preserving the similarity structure among columns.
Words are then compared by taking the cosine of the angle between the two vectors (or the dot product of the normalizations of the two vectors) formed by any two rows.
Values close to 1 represent very similar words, while values close to 0 represent very dissimilar words.
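As a quick illustration of this cosine comparison (a minimal sketch: the toy corpus and the two-component SVD below are invented for the example and are not part of this project; documents rather than words are compared here, but the same measure applies to either), scikit-learn can compute similarities in the reduced LSA space:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
# Hypothetical toy corpus, used only to illustrate the idea
toy_docs = ["the cat sat on the mat",
            "a cat lay on a mat",
            "stock markets fell sharply today"]
tfidf = TfidfVectorizer().fit_transform(toy_docs)              # term-weight matrix
lsa_vecs = TruncatedSVD(n_components=2).fit_transform(tfidf)   # reduced "concept" space
# Cosine of the angle between two vectors: near 1 = similar, near 0 = dissimilar
print(cosine_similarity(lsa_vecs[0:1], lsa_vecs[1:2]))  # the two cat/mat sentences
print(cosine_similarity(lsa_vecs[0:1], lsa_vecs[2:3]))  # cat sentence vs. stock-market sentence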
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import ExtraTreesClassifier
# TF-IDF -> truncated SVD -> length normalization gives the LSA representation
svd = TruncatedSVD(n_components=200, n_iter=7, random_state=42)
lsa = make_pipeline(vectorizer, svd, Normalizer(copy=False))
clf = ExtraTreesClassifier(n_estimators=200, n_jobs=-1, max_depth=36, class_weight=data_dic)
pipe = make_pipeline(lsa, clf)
pipe.fit(training_data, target)
from sklearn.externals import joblib
# Persist the fitted models to disk for later reuse
filename = 'final_model.pkl'
joblib.dump(clf, filename)
filename = 'tfidf_model.pkl'
joblib.dump(vectorizer, filename)
filename = 'lsa_model.pkl'
joblib.dump(lsa, filename)
from lime.lime_text import LimeTextExplainer
# class_name: list of target class labels; doc: the raw text to explain
explainer = LimeTextExplainer(class_names=class_name)
exp = explainer.explain_instance(doc, pipe.predict_proba, num_features=6, top_labels=4)
exp.show_in_notebook(text=False)
from xgboost.sklearn import XGBClassifier
model = XGBClassifier(objective='multi:softmax', nthread=4, n_estimators=1000, scale_pos_weight=data_dic)
model.fit(Lsa_training_data, target)
# make predictions for test data
y_pred = model.predict(Lsa_test_data)
import tensorflow as tf
import tensorflow.contrib.learn as ln
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(Lsa_training_data)
# hidden_units, n_classes, the learning rate and model_dir are placeholders -- set them for your data
clf = ln.DNNClassifier(hidden_units=[], optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
                       dropout=0.2, n_classes=num_classes, feature_columns=feature_columns, model_dir="$PATH")
clf.fit(Lsa_training_data, y_train, batch_size=256, max_steps=40000)