Table of contents

  1. Exploratory Data Analysis in Python

  2. Building a binary classification model using GBM and the H2O library

  3. Grid Search in H2O

  4. Data connections & Pipelines in Python - Hive2Python and Serial connection

  5. Basic web scraping using the BeautifulSoup library

  6. Topic modelling using LDA

  7. N-gram analysis using NLTK

Exploratory Data Analysis in Python

Exploratory Data Analysis refers to the critical process of performing initial investigations on data so as to discover patterns, spot anomalies, test hypotheses, and check assumptions with the help of summary statistics and graphical representations

To share my understanding of the concepts and techniques I know, I'll take the example of the famous Titanic dataset

Thanks to Kaggle and Encyclopedia Titanica for the dataset
import pandas as pd

df_input = pd.read_csv("titanic.csv",sep=",")
df_input.head(4)
Survived Pclass Name Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare
0 0 3 Mr. Owen Harris Braund male 22.0 1 0 7.2500
1 1 1 Mrs. John Bradley (Florence Briggs Thayer) Cum... female 38.0 1 0 71.2833
2 1 3 Miss. Laina Heikkinen female 26.0 0 0 7.9250
3 1 1 Mrs. Jacques Heath (Lily May Peel) Futrelle female 35.0 1 0 53.1000

In this dataset, the column “Survived” is the binary target variable, which tells whether a particular passenger survived or not
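Before going further, a quick optional check of how balanced the target is; a minimal sketch using pandas value_counts:

## Quick check of the class balance for the target variable
print(df_input["Survived"].value_counts())
## the "zeros" column of the univariate summary further below shows 545 non-survivors out of 887 passengers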

## One can get a lot of insights for numeric fields from the describe function of pandas

df_input.describe()
Survived Pclass Age Siblings/Spouses Aboard Parents/Children Aboard Fare
count 887.000000 887.000000 887.000000 887.000000 887.000000 887.00000
mean 0.385569 2.305524 29.471443 0.525366 0.383315 32.30542
std 0.487004 0.836662 14.121908 1.104669 0.807466 49.78204
min 0.000000 1.000000 0.420000 0.000000 0.000000 0.00000
25% 0.000000 2.000000 20.250000 0.000000 0.000000 7.92500
50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.45420
75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.13750
max 1.000000 3.000000 80.000000 8.000000 6.000000 512.32920
## Let's check the number of unique values for each column

print(df_input.nunique())
Survived                     2
Pclass                       3
Name                       887
Sex                          2
Age                         89
Siblings/Spouses Aboard      7
Parents/Children Aboard      7
Fare                       248
dtype: int64
print(df_input.dtypes)
Survived                     int64
Pclass                       int64
Name                        object
Sex                         object
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard      int64
Fare                       float64
dtype: object
### You must have noticed that the unique value count for the column Pclass is only 3, even though it is a numeric field.
## Let's convert it into an object dtype so as to treat it as categorical

df_input["Pclass"] = df_input["Pclass"].astype("O") 
print(df_input.dtypes)
Survived                     int64
Pclass                      object
Name                        object
Sex                         object
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard      int64
Fare                       float64
dtype: object
def univariate(data):
    #Creating separate dataframes for categorical and continuous variables
    data_cat = data.select_dtypes(include=[object])    
    data_cont = data.select_dtypes(exclude=[object])


    data_cont_univ = data_cont.describe(percentiles=[.001,.005,.01,.25,.5,.75,.95,.99,.995,.999]).transpose()

    data_cont_univ["blanks"] = data_cont.isna().sum(axis=0) + data_cont.isnull().sum(axis=0)

    data_cont_univ["zeros"] = (data_cont == 0).sum(axis=0)

    data_cat_univ = data_cat.describe().transpose()

    data_cat_univ["blanks"] = data_cat.isna().sum(axis=0) + data_cat.isnull().sum(axis=0)

    data_cat_univ["zeros"] = data_cat.isin(["0","00","000","0000","00000","000000"]).sum(axis=0)

    return data_cat_univ,data_cont_univ
data_cat_univ,data_cont_univ = univariate(df_input)
data_cat_univ.head()
count unique top freq blanks zeros
Pclass 887 3 3 487 0 0
Name 887 887 Mr. Sarkis Lahoud 1 0 0
Sex 887 2 male 573 0 0
data_cont_univ.head()
count mean std min 0.1% 0.5% 1% 25% 50% 75% 95% 99% 99.5% 99.9% max blanks zeros
Survived 887.0 0.385569 0.487004 0.00 0.0000 0.00 0.0 0.000 0.0000 1.0000 1.00000 1.000000 1.000 1.0000 1.0000 0 545
Age 887.0 29.471443 14.121908 0.42 0.6415 0.83 1.0 20.250 28.0000 38.0000 55.85000 66.000000 70.285 74.6840 80.0000 0 0
Siblings/Spouses Aboard 887.0 0.525366 1.104669 0.00 0.0000 0.00 0.0 0.000 0.0000 1.0000 3.00000 5.000000 8.000 8.0000 8.0000 0 604
Parents/Children Aboard 887.0 0.383315 0.807466 0.00 0.0000 0.00 0.0 0.000 0.0000 0.0000 2.00000 4.000000 5.000 5.1140 6.0000 0 674
Fare 887.0 32.305420 49.782040 0.00 0.0000 0.00 0.0 7.925 14.4542 31.1375 112.55749 249.600388 263.000 512.3292 512.3292 0 15
## Let's do the bivariate analysis now

def bivariate(data,ignore_col_list,event):
    
    #Subsetting categorical variables for bivariate analysis
    col_list_tmp = list(data.select_dtypes(include=[object]).columns)+[event]  # use the "data" argument, not the global df_input
    col_list = [w for w in col_list_tmp if w not in ignore_col_list]
    data_cat = data[col_list]
    
    cols = list(data_cat.columns.values)
    
    #Looping bivariate analysis over all variables and appending the results into a single dataframe
    #The len()-1 excludes the event column itself from the loop (as per the column sequence)
    appended_data = pd.DataFrame()
    for x in range(0, len(data_cat.columns)-1):
    
        data2 = pd.DataFrame({'1.Variable':data_cat.columns[x],
                              '2.Level':data_cat.groupby(data_cat.columns[x])[event].sum().index,
                              '3.Event':data_cat.groupby(data_cat.columns[x])[event].sum(),
                              '4.Volume':data_cat.groupby(data_cat.columns[x])[event].count(),
                              '5.Rate':((data_cat.groupby(data_cat.columns[x])[event].sum()/data_cat.groupby(data_cat.columns[x])[event].count())*100).round(2)})
    

        appended_data = pd.concat([appended_data, data2])  # DataFrame.append is deprecated/removed in newer pandas
    return appended_data
df_bivariate = bivariate(df_input,['Name'],"Survived")
df_bivariate.head()
1.Variable 2.Level 3.Event 4.Volume 5.Rate
1 Pclass 1 136 216 62.96
2 Pclass 2 87 184 47.28
3 Pclass 3 119 487 24.44
female Sex female 233 314 74.20
male Sex male 109 573 19.02

We could now build beautiful plots for these results; in fact, users can go ahead and change the body of our bivariate function to create Python plots

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# if using a Jupyter notebook, include:
%matplotlib inline
### Plotting a grid of plots for bivariate relationships between different columns in the data.

#Caution : Might be very slow depending upon the data size

sns.pairplot(df_input, hue="Survived")

Refer to the Seaborn documentation for more details on pairplot.

Using the above plots, we can see how closely two columns interact with one another

At the same time, the hue based on our dependent variable helps us identify potential features for predicting it
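Coming back to the earlier suggestion of plotting the bivariate results, here is a minimal sketch (using the df_bivariate frame built above) with seaborn's barplot; the particular plot choice is just one option:

## Bar plot of the event (survival) rate by level for each categorical variable
plot_df = df_bivariate.copy()
plot_df["2.Level"] = plot_df["2.Level"].astype(str)   # levels mix ints (Pclass) and strings (Sex)
ax = sns.barplot(data=plot_df, x="2.Level", y="5.Rate", hue="1.Variable")
ax.set_ylabel("Survival rate (%)")
plt.show()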

Building a Binary Classification Model using GBM in the H2O Library

Key Terms used in this section

Gradient Boosting Machine (GBM)

Gradient Boosting Machine (for Regression and Classification) is a forward learning ensemble method. The guiding heuristic is that good predictive results can be obtained through increasingly refined approximations.

H2O Library

H2O is a fully open source, distributed in-memory machine learning platform with linear scalability. H2O supports the most widely used statistical & machine learning algorithms including gradient boosted machines, generalized linear models, deep learning and more.

In a grid search, users specify a set of values for each hyperparameter that they want to search over, and H2O will train a model for every combination of the hyperparameter values. This means that if you have three hyperparameters and you specify 5, 10 and 2 values for them respectively, your grid will contain a total of 5 × 10 × 2 = 100 models

Let's take the Titanic dataset and try to predict the “Survived” attribute using the H2O GBM technique

import h2o
import os
import pandas as pd
import numpy as np
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
## Shut down any cluster left over from a previous session, then start (or connect to) a fresh one
try:
    h2o.shutdown()
except Exception:
    pass

h2o.init()

train_df = h2o.import_file('titanic.csv')
Parse progress: |█████████████████████████████████████████████████████████| 100%
## Pass the target variable to be predicted
y = 'Survived'

### Pass the list of predictors 
x = ['Pclass', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']

# If target variable is binary, convert it to factor
train_df[y]= train_df[y].asfactor()
## Split the dataset into train and valid frames
train, valid = train_df.split_frame(ratios=[.8], seed=1234)
### Let's define our grid parameters. It is the same as running a for loop to generate models from the same ML algorithm but with different hyperparameters

gbm_params2 = {'learn_rate': [i * 0.01 for i in range(4, 5)],
                'max_depth': list(range(11, 12)),
                'sample_rate': [i * 0.1 for i in range(7, 9)],
                'col_sample_rate': [i * 0.1 for i in range(8, 9)],
               'col_sample_rate_per_tree': [i * 0.1 for i in range(7, 8)],
              'learn_rate_annealing':[0.99]
              }

# Search criteria
search_criteria = {'strategy': 'RandomDiscrete', 'max_models': 20, 'seed': 1}

# Train and validate a random grid of GBMs
gbm_grid2 = H2OGridSearch(model=H2OGradientBoostingEstimator,
                          grid_id='gbm_grid',
                          hyper_params=gbm_params2,
                          search_criteria=search_criteria)
gbm_grid2.train(x=x, y=y,
                training_frame=train,
                validation_frame=valid,
                ntrees=120,
                seed=1, ignore_const_cols=True)

Exporting the results to a csv file

ModelSummary = pd.DataFrame(columns=['Model_ID','learn_rate', 'Trees', 'Depth', 'Row_Sampling', 'Col_Sampling', 'MTries','NBins', 
                                    'Categorical_Encoding', 'Hist_Type', 'Training_Log_Loss', 'Validation_Log_Loss', 
                                    'Training_AUC', 'Validation_AUC'])

ModelSummary2 = pd.DataFrame(columns=['Model_ID', 'learn_rate','Trees', 'Depth', 'Row_Sampling', 'Col_Sampling', 'MTries','NBins', 
                                    'Categorical_Encoding', 'Hist_Type', 'Training_Log_Loss', 'Validation_Log_Loss', 
                                    'Training_AUC', 'Validation_AUC'])
for Model in gbm_grid2:
    ModelSummary['Model_ID'] = [Model.model_id]
    ModelSummary['learn_rate'] = [Model.actual_params.get('learn_rate')]
    ModelSummary['Trees'] = [Model.actual_params.get('ntrees')]
    ModelSummary['Depth'] = [Model.actual_params.get('max_depth')]
    ModelSummary['Row_Sampling'] = [Model.actual_params.get('sample_rate')]
    ModelSummary['Col_Sampling'] = [Model.actual_params.get('col_sample_rate_per_tree')]
    ModelSummary['MTries'] = [Model.actual_params.get('mtries')]
    ModelSummary['NBins'] = [Model.actual_params.get('nbins')]
    ModelSummary['Categorical_Encoding'] = [Model.actual_params.get('categorical_encoding')]
    ModelSummary['Hist_Type'] = [Model.actual_params.get('histogram_type')]
    ModelSummary['Training_Log_Loss'] = [Model.logloss()]
    ModelSummary['Validation_Log_Loss'] = [Model.logloss(valid=True)]
    ModelSummary['Training_AUC'] = Model.auc()
    ModelSummary['Validation_AUC'] = Model.auc(valid=True)
    ModelSummary2 = pd.concat([ModelSummary2, ModelSummary], sort=False)  # DataFrame.append is deprecated/removed in newer pandas

ModelSummary2.to_csv("gbm_grid_results.csv")
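As a follow-up, a minimal sketch (assuming the grid above has finished training) of pulling out the top model sorted by AUC and saving it to disk; the save path is a placeholder:

## Sort the grid by AUC (computed on the validation frame when one is supplied) and pick the top model
sorted_grid = gbm_grid2.get_grid(sort_by='auc', decreasing=True)
best_gbm = sorted_grid.models[0]
print(best_gbm.auc(valid=True))

## Persist the winning model to disk (path is a placeholder)
model_path = h2o.save_model(model=best_gbm, path="<model output dir>", force=True)
print(model_path)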

Building data connections between different servers

In building machine learning applications, the data sometimes resides on more than one server or disk. Let's say we have a client server from which we intend to fetch data into our own server for processing, and to which we push back the results.

In the above scenario, the following steps could be helpful

  1. Sqoop import the data from the client server and store its metadata on your server.
  2. Perform data processing and analysis
  3. Sqoop export the output data back to the client server

Let’s see how we can do this in Python
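Steps 1 and 3 are usually plain Sqoop command-line jobs; one simple way to trigger them from Python is via subprocess. A minimal sketch, assuming Sqoop is available on the box, with every connection detail left as a placeholder:

import subprocess

## Sqoop import: pull a table from the client server into HDFS (all values are placeholders)
sqoop_import = [
    "sqoop", "import",
    "--connect", "jdbc:mysql://<client-server>/<db>",
    "--username", "<id>", "--password", "<password>",
    "--table", "<tablename>",
    "--target-dir", "/user/<id>/<tablename>",
    "-m", "1"
]
subprocess.run(sqoop_import, check=True)

## Sqoop export: push the processed results back to the client server (again, placeholders)
sqoop_export = [
    "sqoop", "export",
    "--connect", "jdbc:mysql://<client-server>/<db>",
    "--username", "<id>", "--password", "<password>",
    "--table", "<output_tablename>",
    "--export-dir", "/user/<id>/<output_tablename>"
]
subprocess.run(sqoop_export, check=True)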

Hive and Python

################# Hive connection in Python using pyhive package 

## In Python, we have a library called pyhive; it can be used to access a Hive table residing in an HDFS location
import pandas as pd
from pyhive import hive
import time
statement = '''SELECT * from dbo.tablename limit 10'''


print("Setting up connection")

### In the connect function, we will establish the connection between the hive database and our Python Kernel
conn = hive.connect(host='<server name>',\
                    port=<port number>,\
                    auth='LDAP',\
                    username='<id>',\
                    password="<password>")

print("Connection successful")
start1=time.time()
start=time.ctime()
print( "started at:"),start

cur = conn.cursor()
#### The below command will fetch the results for the query named "statement"
cur.execute(statement)
df = cur.fetchall()
### The output of fetchall is a list and doesn't contain the column names, so we will have to pick up the schema separately

### The below command will pull the column names as a list
cur.execute("show columns in dbo.tablename")
df1 = cur.fetchall()


col_names=[w[0] for w in df1]
print( "Data fetch completed at:"),time.ctime()
print(  "Time taken: "),time.time()-start1


print("output saved as a pandas df")

df_final=pd.DataFrame(df,columns=col_names)
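As an aside, pandas can do the fetch and the column naming in one step over the same connection. A minimal sketch; note that pandas officially recommends SQLAlchemy connectables for read_sql, so treat the raw DBAPI connection here as a convenience:

## One-step alternative: let pandas run the query and build the dataframe directly
df_final = pd.read_sql(statement, conn)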

SSH Connection in Python

Using Paramiko to establish SSH connection in Python

################ Serial connection in Python 

Sometimes we intend to establish a serial data connection capable of transmitting a data stream bit by bit, like this: 100000000000100000000010000

### Loading the paramiko package for establishing the SSH connection

import paramiko

print("Initiate a serial SSH client handle")

ssh_client =paramiko.SSHClient()

### The below command is optional; it tells paramiko to automatically accept unknown host keys
ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

### The below command actually creates the connection to the remote UNIX server

ssh_client.connect(hostname="<server location>",username="<id>",password="<passwordhere>")

handle = ssh_client.open_sftp()

######### Reading file from HDFS location into python session

file_path =handle.file("<remote server location to be read>")

data_read= pd.read_csv(file_path)

### Write the file to local location

data_read.to_csv("testinfile.csv")

#### Writing back file from domino location to HDFS location 

handle.put("testinfile.csv","<remote server location where we want to write back>")

Note that the SSH connection is very slow, since it is a serial connection and transmits data bit by bit.

So, please use it for data transmission only for small datasets.

For big datasets, use a JDBC or ODBC connection in Python or R
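As one example, a minimal ODBC sketch using the pyodbc package; the DSN and credentials below are placeholders and depend on how the driver is configured on your machine:

import pyodbc
import pandas as pd

## Connect through an ODBC DSN configured for the database (placeholder credentials)
conn = pyodbc.connect("DSN=<dsn name>;UID=<id>;PWD=<password>")

## Bulk reads over ODBC are much faster than pulling files over SFTP
## (pandas prefers SQLAlchemy connectables for read_sql, but a DBAPI connection works as a convenience)
df_big = pd.read_sql("SELECT * FROM <tablename>", conn)
conn.close()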

Web Scraping in Python

Here we will cover different libraries in Python which can be used for scraping data from public websites

Put on your hacking cap and let's break the myths and jazziness around web scraping

Important notes before scraping data from any website

  1. Please check in advance whether it is legal to scrape data from that website by inspecting a few compliance-related aspects in its robots.txt (see the sketch after this list)
  2. Inspect the structure of the website and analyse what its flow and schema look like
  3. Always improvise while fetching the desired data.
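For point 1, the standard library can read robots.txt directly. A minimal sketch, using the same example site that appears below:

## Minimal robots.txt check using the standard library
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://web.mta.info/robots.txt')
rp.read()
print(rp.can_fetch('*', 'http://web.mta.info/developers/turnstile.html'))   # True means robots.txt allows fetching this page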
### The requests library is used for fetching the data from an HTML page
import requests
import urllib.request
import time
## BeautifulSoup is a very cool library; you can also check out the scrapy library (highly automated and parameterized)
## to study more about this lib, refer to https://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import BeautifulSoup

## Target URL
url = 'http://web.mta.info/developers/turnstile.html'

response = requests.get(url)

## if the previous command was successful, then

soup = BeautifulSoup(response.text, 'html.parser')

## in scraping, the whole intention is to pull tags satisfying either a string or a regex
## here I am pulling all anchor ("a") tags, since they hold the hyperlinks on the page

all_tags = soup.findAll('a')

##This code gives us every element that has an <a> tag, in the form of a "list"

# now let's try to actually reach out to some underlying url via the "href" attribute of an "a" tag

single_tag = all_tags[12]
link = single_tag["href"]

download_url = 'http://web.mta.info/developers/' + link
urllib.request.urlretrieve(download_url, './' + link[link.find('/turnstile_')+1:])


Note that a request could take anywhere from a few milliseconds to a few seconds, so when you run a loop, do keep sleep commands in the code
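For instance, a minimal sketch of a polite download loop over the turnstile links found above, pausing between requests:

## Loop over the anchor tags, keep only turnstile data links, and pause between downloads
for single_tag in all_tags:
    link = single_tag.get("href")
    if link is None or '/turnstile_' not in link:
        continue                                  # skip anchors that are not turnstile files
    download_url = 'http://web.mta.info/developers/' + link
    urllib.request.urlretrieve(download_url, './' + link[link.find('/turnstile_')+1:])
    time.sleep(1)                                 # be polite: sleep between requests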

Topic modelling & N-Gram in Python

import pandas as pd
import re
import numpy as np
import matplotlib
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from gensim import corpora, models
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize

LDA For Topic Modelling

LDA is an iterative algorithm. Here are the two main steps:

  1. In the Initialization stage, each word is assigned to a random topic.

  2. Iteratively, the algorithm goes through each word and reassigns it to a topic, taking into consideration

    a. What’s the probability of the word belonging to a topic

    b. What's the probability of the document being generated by that topic

########## Data cleaning for Topic modelling 
def data_cleaning(tweet,custom_list):
    tweet = re.sub(r'\$\w*','',tweet) # Remove tickers
    tweet = re.sub(r'https?:\/\/.*\/\w*','',tweet) # Remove hyperlinks
    tweet = re.sub(r'['+string.punctuation+']+', ' ',tweet) # Remove punctuation like 's
    #stop = set(stopwords.words('english'))
    tweet = re.sub(r'[^a-zA-Z0-9]'," ",tweet)
    stop_words = set(['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
                      'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', 'd',
                      'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further',
                      'had', 'hadn', 'has', 'hasn', 'have', 'haven', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
                      'me', 'mightn', 'more', 'most', 'mustn', 'my', 'myself', 'needn', 'now', 'o', 'of', 'off', 'on', 'once',
                      'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', 'she',
                      'should', 'so', 'some', 'such', 't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
                      'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very',
                      'was', 'we', 'were', 'weren', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will',
                      'with', 'won', 'y', 'you', 'your', 'yours', 'yourself', 'yourselves'])
    exclude = set(string.punctuation)
    exclude1= set(custom_list)
    stop_words.update(exclude1)
    lemma = WordNetLemmatizer()
    stop_free = " ".join([i for i in tweet.lower().split() if i not in stop_words])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
########## Pass custom list of stop words in the following list "custom_list"
	
custom_list=["fall"]
####### Load input data for text analysis 
text1 = pd.read_csv(r'path.csv', sep=',', encoding='iso-8859-1')

text1.dropna(axis=0,how='any',inplace=True)

#### Here "Text" column is our target column 
text=text1["Text"].values.tolist()

####### mention number of topics 
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')

### text column was converted into a list for easy iterations 
tweet_text=text  # text is a list of column to be modelled
doc_complete=tweet_text
#doc_complete_2=[]
doc_complete_2=tweet_text


a = 0 
 
### For gensim we need to tokenize the data and filter out stopwords

tokenized_data = []
for w in doc_complete_2:
    a = a + 1
    # data_cleaning returns a cleaned string, so split it into a list of tokens for gensim
    tokenized_data.append(data_cleaning(w, custom_list).split())

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form

corpus = [dictionary.doc2bow(text) for text in tokenized_data] 
# Have a look at how the Nth document looks like: [(word_id, count), ...]
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...
# Build the LDA model

### Let's create an LDA model object, specifying the corpus, the number of topics and the word id dictionary

lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
final=[]
for topic in lda_model.show_topics(num_topics=NUM_TOPICS, formatted=False, num_words=6):
    topicwords = [w for (w, val) in topic[1]]
    topicwords_val = [val for (w, val) in topic[1]]
    final.append([topicwords,topicwords_val])
final1=pd.DataFrame(final,columns=["topic","prob"])



final1.to_csv(path+"/topics.csv")  # "path" should point to your output directory
  • All of the algorithms take the number of topics (num_topics) as a parameter; none of them can infer the number of topics in the document collection on their own

  • All of the algorithms take the Document-Word Matrix (or Document-Term Matrix) as input, where DWM[i][j] = the number of occurrences of word_j in document_i

  • All of them output 2 matrices: WTM (Word Topic Matrix) and TDM (Topic Document Matrix). The matrices are significantly smaller, and the result of their multiplication should be as close as possible to the original DWM matrix.
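With gensim, both sides are easy to inspect; a quick sketch on the model trained above:

## Topic mix (TDM side) for the first document: a list of (topic_id, probability) pairs
print(lda_model.get_document_topics(corpus[0]))

## Word side (WTM) for a single topic: the top words and their weights
print(lda_model.show_topic(0, topn=6))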

N-Gram Analysis

########################################N-gram analysis

import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from collections import Counter
from nltk import ngrams

############## Creating UDF for ngram analysis , n means number of grams 1,2,3 ....


############ Data cleaning for Ngram analysis 
def clean_text(text,custom_list):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    cleaned_text1 = [t for t in cleaned_text if t not in custom_list]
    return cleaned_text1  # return the list with the custom stop words removed as well

def cal_ngram(text,n):
    token = word_tokenize(text)  # word_tokenize is imported from nltk above
    n_grams = ngrams(token,n)
    return n_grams

############# 
n = 3 #### Here n means number of grams

n_gram = cal_ngram(text[0],n)   # generating N grams for input data

### Counter library 
n_gram_common = Counter(n_gram).most_common()  # N-grams sorted by frequency; pass e.g. 10 to keep only the top 10

n_gram_df=pd.DataFrame(n_gram_common,columns=["N Gram","Frequency"])

#### Now these n-gram results can be used to publish a word cloud
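A minimal sketch of that word cloud, assuming the third-party wordcloud package is installed (it is not used anywhere above):

## Build a frequency dict from the n-gram counts and render a word cloud
from wordcloud import WordCloud

freq = {" ".join(gram): count for gram, count in n_gram_common}
wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(freq)

plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()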

End of the Document

Stay Tuned for more information