In this example, we will attempt to predict whether a pull request will be merged. The dataset we will be using is a newer version of the one published at MSR 2014 by Gousios and Zaidman, available here. The dataset comes in CSV format.
For this task, we will be using PySpark and the Spark ML library, to demonstrate how nicely Spark integrates with the extremely rich Python Data Science/Big Data ecosystem.
We begin our exploration by importing typical libraries from the Python world:
from ggplot import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
In contrast to our previous examples, we will be using higher level components; one of them is the Spark CSV reader. This allows us to read a CSV file into a Spark Dataframe in one line; most importantly, it can automatically infer the schema (data types) from our CSV, which saves us lots of boring work!
df = sqlContext.read.csv("hdfs://bdp1:8020/pullreqs.csv",
sep=",", header=True, inferSchema=True).cache()
df = sqlContext.read.csv("../datasets/pullreqs.csv",
sep=",",
header=True,
inferSchema=True).cache()
sqlContext.registerDataFrameAsTable(df, "pullreqs")
We are also using Spark's SQLContext instead of our typical SparkContext. This allows us to run SQL queries directly on top of our CSV data; in the background, Spark distributes the processing load across all workers.
PySpark integrates with the PyData ecosystem; this allows us to use the toPandas() function, which converts a distributed Spark DataFrame into a local Pandas dataframe. There are many legitimate reasons why this is useful: for example, we can use the data for plotting, or apply a scikit-learn machine learning algorithm on top of it. In our case, we exploit the fact that Jupyter, our notebook system, knows about Pandas dataframes to print the results locally.
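As a quick illustration of this pattern (just a sketch, not needed for the rest of the analysis), we could pull a handful of rows to the driver and inspect them as a Pandas dataframe:

# Bring a 5-row sample of the distributed DataFrame to the driver as Pandas
df.select('project_name', 'lang', 'merged').limit(5).toPandas()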
df.printSchema()
root
 |-- _c0: integer (nullable = true)
 |-- pull_req_id: integer (nullable = true)
 |-- project_name: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- github_id: integer (nullable = true)
 |-- created_at: integer (nullable = true)
 |-- merged_at: string (nullable = true)
 |-- closed_at: integer (nullable = true)
 |-- lifetime_minutes: integer (nullable = true)
 |-- mergetime_minutes: string (nullable = true)
 |-- merged_using: string (nullable = true)
 |-- conflict: boolean (nullable = true)
 |-- forward_links: boolean (nullable = true)
 |-- intra_branch: boolean (nullable = true)
 |-- description_length: integer (nullable = true)
 |-- num_commits: integer (nullable = true)
 |-- num_commits_open: integer (nullable = true)
 |-- num_pr_comments: integer (nullable = true)
 |-- num_issue_comments: integer (nullable = true)
 |-- num_commit_comments: integer (nullable = true)
 |-- num_comments: integer (nullable = true)
 |-- num_commit_comments_open: integer (nullable = true)
 |-- num_participants: integer (nullable = true)
 |-- files_added_open: integer (nullable = true)
 |-- files_deleted_open: integer (nullable = true)
 |-- files_modified_open: integer (nullable = true)
 |-- files_changed_open: integer (nullable = true)
 |-- src_files_open: integer (nullable = true)
 |-- doc_files_open: integer (nullable = true)
 |-- other_files_open: integer (nullable = true)
 |-- files_added: integer (nullable = true)
 |-- files_deleted: integer (nullable = true)
 |-- files_modified: integer (nullable = true)
 |-- files_changed: integer (nullable = true)
 |-- src_files: integer (nullable = true)
 |-- doc_files: integer (nullable = true)
 |-- other_files: integer (nullable = true)
 |-- src_churn_open: integer (nullable = true)
 |-- test_churn_open: integer (nullable = true)
 |-- src_churn: integer (nullable = true)
 |-- test_churn: integer (nullable = true)
 |-- new_entropy: double (nullable = true)
 |-- entropy_diff: double (nullable = true)
 |-- commits_on_files_touched: integer (nullable = true)
 |-- commits_to_hottest_file: integer (nullable = true)
 |-- hotness: double (nullable = true)
 |-- at_mentions_description: integer (nullable = true)
 |-- at_mentions_comments: integer (nullable = true)
 |-- perc_external_contribs: double (nullable = true)
 |-- sloc: integer (nullable = true)
 |-- test_lines_per_kloc: double (nullable = true)
 |-- test_cases_per_kloc: double (nullable = true)
 |-- asserts_per_kloc: double (nullable = true)
 |-- stars: integer (nullable = true)
 |-- team_size: integer (nullable = true)
 |-- workload: integer (nullable = true)
 |-- ci: string (nullable = true)
 |-- requester: string (nullable = true)
 |-- closer: string (nullable = true)
 |-- merger: string (nullable = true)
 |-- prev_pullreqs: integer (nullable = true)
 |-- requester_succ_rate: double (nullable = true)
 |-- followers: integer (nullable = true)
 |-- main_team_member: boolean (nullable = true)
 |-- social_connection: boolean (nullable = true)
 |-- prior_interaction_issue_events: integer (nullable = true)
 |-- prior_interaction_issue_comments: integer (nullable = true)
 |-- prior_interaction_pr_events: integer (nullable = true)
 |-- prior_interaction_pr_comments: integer (nullable = true)
 |-- prior_interaction_commits: integer (nullable = true)
 |-- prior_interaction_commit_comments: integer (nullable = true)
 |-- first_response: integer (nullable = true)
 |-- prior_interaction_comments: integer (nullable = true)
 |-- prior_interaction_events: integer (nullable = true)
 |-- has_ci: boolean (nullable = true)
 |-- merged: boolean (nullable = true)
sqlContext.sql("""select merged_using, count(*) as occurences
from pullreqs
group by merged_using
order by occurences desc""").show()
+--------------------+-----------+
|        merged_using|occurrences|
+--------------------+-----------+
|              github|     364528|
|   commits_in_master|     342339|
|             unknown|     138566|
|  merged_in_comments|      29273|
|commit_sha_in_com...|      23234|
|     fixes_in_commit|      18125|
+--------------------+-----------+
The above SQL query can also be defined programmatically. Behind the scenes, Spark compiles SQL into a series of DataFrame calls like the ones shown below.
df.\
groupBy(df.merged_using).\
agg({'merged_using': 'count'}).\
orderBy(desc("count(merged_using)")).\
show()
+--------------------+-------------------+
|        merged_using|count(merged_using)|
+--------------------+-------------------+
|              github|             364528|
|   commits_in_master|             342339|
|             unknown|             138566|
|  merged_in_comments|              29273|
|commit_sha_in_com...|              23234|
|     fixes_in_commit|              18125|
+--------------------+-------------------+
sqlContext.sql("""select count(distinct(project_name)) as num_projects
from pullreqs""").show()
+------------+
|num_projects|
+------------+
|        5543|
+------------+
Ok, but how many projects are there per programming language? Let's do a quick SQL query:
sqlContext.sql("""select lang, count(*) as num_projects
from (
select distinct(project_name), lang
from pullreqs
) as project_langs
group by lang
order by num_projects desc""").show()
+----------+------------+
|      lang|num_projects|
+----------+------------+
|javascript|        1726|
|    python|        1518|
|      ruby|        1086|
|      java|        1075|
|     scala|         138|
+----------+------------+
We can do exactly the same query using Dataframe transformations:
df.select(df["project_name"], df["lang"]).\
distinct().\
groupBy(df["lang"]).\
agg({"lang":"count"}).\
orderBy("count(lang)", ascending=False).\
show()
+----------+-----------+
|      lang|count(lang)|
+----------+-----------+
|javascript|       1726|
|    python|       1518|
|      ruby|       1086|
|      java|       1075|
|     scala|        138|
+----------+-----------+
r = sqlContext.sql("""select project_name, count(*) as num_prs
from pullreqs
group by project_name""").toPandas()
ggplot(aes(x = 'num_prs'), data=r) + \
geom_histogram(binwidth=5) + \
scale_x_log() + \
ylab("Num projects") + xlab("Num PRs")
How much time does it take to process a PR? To find out, we plot a boxplot of the time it takes to close a PR. We first convert our data into a Pandas Dataframe, then convert seconds to hours (dividing by 3600), and finally apply a log transformation. We do this to eliminate the effect of outliers on our plot.
As we can see, the majority (75%) of all pull requests are closed in less than $10^{4.4}$ hours (roughly 1,046 days), while more than 50% are closed within just 7 days.
df.withColumn("duration", df['closed_at'] - df['created_at']).\
select('duration').\
toPandas().\
apply(lambda x: x / 3600).\
apply(np.log).\
plot.\
box()
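If we want the actual quantile values rather than eyeballing the boxplot, a quick sketch using Spark's approxQuantile (available since Spark 2.0) would look something like this:

# Approximate median and 75th percentile of PR duration, in hours
durations = df.withColumn("duration_hours",
                          (df['closed_at'] - df['created_at']) / 3600)
median_h, q75_h = durations.approxQuantile("duration_hours", [0.5, 0.75], 0.01)
print "median: %.1f hours, 75th percentile: %.1f hours" % (median_h, q75_h)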
How many people participate in a pull request? Not many, as it turns out:
df.select('num_participants').toPandas().describe()
| | num_participants |
|---|---|
| count | 916065.000000 |
| mean | 1.332705 |
| std | 1.491926 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 1.000000 |
| 75% | 2.000000 |
| max | 292.000000 |
How many comments does a pull request receive? Only a few:
df.select('num_comments').toPandas().describe()
| | num_comments |
|---|---|
| count | 916065.000000 |
| mean | 3.329427 |
| std | 9.958141 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 1.000000 |
| 75% | 3.000000 |
| max | 1242.000000 |
It is important for our machine learning task to check whether the dataset is balanced, i.e. how many pull requests end up merged versus not merged:
df.groupBy('merged').count().toPandas()
| | merged | count |
|---|---|---|
| 0 | True | 777499 |
| 1 | False | 138566 |
We are trying to create a predictor that, given a set of features (metrics describing various aspects of a single PR), will produce a response (prediction): either True (the PR will be merged) or False (the PR will not be merged). This makes our problem a typical binary classification task.
There are various algorithms that, given a set of feature vectors (vectors/arrays of numerical values corresponding to our features), will come up with a model to predict an outcome. The ones we will be using are logistic regression, linear support vector machines, decision trees, random forests, and gradient boosted trees.
The first step in any binary classification task is to define the features and the response variable:
feature_cols = [
'intra_branch',
'description_length',
'num_commits_open',
'num_commit_comments_open',
'files_added_open',
'files_deleted_open',
'files_changed_open',
'doc_files_open',
'other_files_open',
'src_churn_open',
'test_churn_open',
'new_entropy',
'hotness',
'at_mentions_description',
'perc_external_contribs',
'test_cases_per_kloc',
'requester_succ_rate',
'followers',
'main_team_member',
'social_connection',
'prior_interaction_events',
'has_ci'
]
response = 'merged'
# We drop any possible duplicates and cache the resulting data frame
data = df.select(feature_cols).dropDuplicates().cache()
Most machine learning algorithm implementations work with numerical vectors. For this reason, we need to convert all features in our source data frame that are not of type Integer to numeric. Fortunately, we only have a few Boolean features, and these can be represented as Integers with a simple type cast, assigning 1 to True values and 0 to False. If we had factors as features (features that take values from a predefined set), we would need to use a StringIndexer on them.
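Just as an illustration of what that would look like (we do not actually need it here), a StringIndexer could map, say, the lang column to a numeric index:

from pyspark.ml.feature import StringIndexer

# Hypothetical example: map each programming language string to a numeric index
lang_indexer = StringIndexer(inputCol='lang', outputCol='lang_index')
indexed_df = lang_indexer.fit(df).transform(df)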
SparkML needs all features to be in a single vector per data point. For this, we use the VectorAssembler transformation.

SparkML uses pipelines to organize the application of transformations on data frames. In our case, we only need a pipeline that performs the VectorAssembler step, but pipelines can be arbitrarily long.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
# All boolean columns
boolean = ['intra_branch', 'main_team_member',
'social_connection', 'has_ci']
boolean_out = map(lambda x: x + "_int", boolean)
# Update the feature_cols with information about the new column names
feature_cols = [item for item in set(feature_cols) \
if item not in set(boolean)] + \
boolean_out
# Type cast boolean columns to Integers (convert to numeric)
for x in boolean:
df = df.withColumn(x + "_int", df[x].cast(IntegerType()))
df = df.withColumn("merged_int", df['merged'].cast(IntegerType()))
# Convert feature columns to a numeric vector
assembler_features = VectorAssembler(inputCols=feature_cols,
outputCol='features')
# Construct and execute a pipeline, cache the results.
pipeline = Pipeline(stages=[assembler_features])
allData = pipeline.fit(df).transform(df).cache()
After we have our data in the desired format, we can start experimenting with machine learning algorithms. For that, we need two datasets: one to train our algorithm on and one to test the resulting model with.
(trainingData, testData) = allData.randomSplit([0.9, 0.1], seed=42)
The process of running an ML algorithm on a dataset consists of the following steps:

1. Train (fit) a model on the training data.
2. Evaluate the resulting model on the test data.
To evaluate the trained model (step 2), and since we are doing binary classification, we need to test how well our model predicts against the ground truth included in the test set. The result of step 2 is the so-called confusion matrix, which looks like this:
| | Ground truth: True | Ground truth: False |
|---|---|---|
| Prediction: True | TP | FP |
| Prediction: False | FN | TN |
Using this table we can come up with many metrics that capture several aspects of the classification performance, such as:
$$ Precision = \frac{TP}{TP + FP} $$

$$ Recall = \frac{TP}{TP + FN} $$

$$ Accuracy = \frac{TP + TN}{TP + FP + FN + TN} $$

$$ False\ positive\ rate = \frac{FP}{FP + TN} $$

When predicting, most binary classification algorithms return the probability that a result is either true or false, rather than the actual result; depending on how strict our prediction requirements are, we need to vary the threshold above which we start believing a prediction. This threshold is usually set at 0.7, so if our classifier reports a probability of 0.73 for a classification result being True, we take its value as True.
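To make the formulas above concrete, here is a tiny sketch that computes them from a set of made-up confusion matrix counts (the numbers are purely illustrative):

# Purely illustrative confusion matrix counts
TP, FP, FN, TN = 700, 120, 80, 100

precision = float(TP) / (TP + FP)
recall = float(TP) / (TP + FN)
accuracy = float(TP + TN) / (TP + FP + FN + TN)
false_positive_rate = float(FP) / (FP + TN)

print "precision=%.2f recall=%.2f accuracy=%.2f FPR=%.2f" % \
    (precision, recall, accuracy, false_positive_rate)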
By varying this threshold from 0.5 to 1, we can see how confident our classifier is in its results; we can then test those results against reality (our ground truth) to see how good they actually are. To do so, we plot the classifier's recall (or True Positive Rate) against its False Positive Rate for various values of the threshold. The resulting plot is called the Receiver Operating Characteristic (ROC) curve.
The area under each curve can be calculated; this results in a composite metric, the Area Under the (ROC) Curve (AUC), which describes the predictive power of our classifier relative to a random classifier. AUC enables us to compare classifiers with each other and is especially useful as a metric for unbalanced datasets, as in our case.
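As a side note, a minimal sketch of how such a curve could be drawn locally, assuming scikit-learn is installed and using a fitted model that outputs probabilities (for example, the random forest we train below):

from sklearn.metrics import roc_curve, auc

# Collect per-example probabilities and ground truth on the driver
preds = rfModel.transform(testData).\
    select('probability', 'merged_int').\
    toPandas()
scores = preds['probability'].apply(lambda v: float(v[1]))
fpr, tpr, _ = roc_curve(preds['merged_int'], scores)

plt.plot(fpr, tpr, label="AUC = %.3f" % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle='--')  # random classifier baseline
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()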
For our purposes, we will be using the AUC metric.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
## Calculate and print the AUC metric
def evaluate(testData, predictions):
evaluator = BinaryClassificationEvaluator(labelCol="merged_int",
rawPredictionCol="rawPrediction")
print "AUC: %f" % evaluator.evaluate(predictions)
As with other regression methods, binary logistic regression does curve fitting: it attempts to compute the coefficients $a$ and $b$ and the intercept $c$ of a linear model of the form $y = ax_1 + bx_2 + c$, where $x_1$, $x_2$ and $y$ come from the training data, in order to maximize a training criterion (the workings under the hood are very different, though).
With default settings, it does a terrible job on our dataset, with an AUC of 0.5.
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.3,
elasticNetParam=0.8 ,labelCol="merged_int")
lrModel = lr.fit(trainingData)
evaluate(testData, lrModel.transform(testData))
AUC: 0.500000
SVMs attempt to partition the dataset using hyperplanes, chosen so that the data points of each class lie as far as possible from the separating hyperplane, potentially in high-dimensional spaces.
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(maxIter=10, regParam=0.1,labelCol="merged_int")
lsvcModel = lsvc.fit(trainingData)
evaluate(testData, lsvcModel.transform(testData))
AUC: 0.662181
A decision (classification) tree is a tree structure where intermediate nodes represent a decision that partitions the data, while leaves represent a classification class. Decision trees are easy to train and very intuitive to interpret, as they mimic human decision making. Unfortunately, they tend to overfit (especially if we allow them to grow to great depths) and are not robust against variations in the data.
On our dataset, they perform rather well.
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="merged_int")
dtModel = dt.fit(trainingData)
evaluate(testData, dtModel.transform(testData))
AUC: 0.751664
Random forests work by building many decision trees. For building each tree, a (configurable) number of features is selected randomly, and the tree is grown up to a (configurable) depth. At prediction time, all trees are asked for their "opinion" on the input feature values, and the majority vote wins.

Random forests are very easy to train, and the default values usually lead to good performance out of the box.
from pyspark.ml.classification import RandomForestClassifier as RF
from pyspark.ml.evaluation import BinaryClassificationEvaluator
rf = RF(labelCol='merged_int', featuresCol='features',
numTrees=100, maxDepth=5)
rfModel = rf.fit(trainingData)
evaluate(testData, rfModel.transform(testData))
AUC: 0.787138
One particularly interesting property of tree ensemble methods, such as random forests, is that they provide us with an indication of which factors are most influential when doing a classification.
pd.DataFrame(data=zip(feature_cols, rfModel.featureImportances),
columns=("feature", "importance")).\
sort_values("importance", ascending=False).\
head(n = 10)
| | feature | importance |
|---|---|---|
| 1 | requester_succ_rate | 0.366819 |
| 14 | prior_interaction_events | 0.341332 |
| 19 | main_team_member_int | 0.164281 |
| 13 | num_commits_open | 0.027224 |
| 18 | intra_branch_int | 0.016054 |
| 3 | new_entropy | 0.013582 |
| 16 | src_churn_open | 0.013561 |
| 20 | social_connection_int | 0.010925 |
| 5 | perc_external_contribs | 0.009141 |
| 4 | description_length | 0.008753 |
Boosting refers to a general class of algorithms that try to create strong predictors from ensembles of weak ones (e.g. decision trees). After training a weak classifier, boosting algorithms weight the training examples so as to favour those that were misclassified by the current ensemble; new weak classifiers are therefore trained mostly on the data the ensemble currently misclassifies.

Gradient boosting is a form of boosting that uses gradient descent as its optimization method and stopping criterion.
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxIter=10, maxDepth=5,
labelCol="merged_int", seed=42)
gbtModel = gbt.fit(trainingData)
evaluate(testData, gbtModel.transform(testData))
AUC: 0.797577
pd.DataFrame(data=zip(feature_cols, gbtModel.featureImportances),
columns=("feature", "importance")).\
sort_values("importance", ascending=False).\
head(n = 10)
| | feature | importance |
|---|---|---|
| 1 | requester_succ_rate | 0.469043 |
| 14 | prior_interaction_events | 0.158051 |
| 19 | main_team_member_int | 0.093084 |
| 20 | social_connection_int | 0.056571 |
| 4 | description_length | 0.052702 |
| 16 | src_churn_open | 0.031374 |
| 18 | intra_branch_int | 0.025636 |
| 9 | test_cases_per_kloc | 0.022868 |
| 15 | files_changed_open | 0.021664 |
| 13 | num_commits_open | 0.014422 |
As we have seen already, most algorithms have many knobs (configuration parameters) that we can use to tune model building. These are often called hyperparameters. To tune them in a systematic manner, Spark ML offers the ParamGridBuilder tool, which accepts value ranges for all hyperparameters.
For algorithms that are not robust against overfitting (building a model that predicts the training set very well but fails to generalize), for example decision trees, it is often advisable to train them on various instances of the training set and report average precision/recall/F1/AUC scores. This process is called cross-validation. In typical random-selection $k$-fold cross-validation, the algorithm randomly splits the dataset $k$ times (usually 80%/20% into training and evaluation data), builds a model on each training split, and evaluates it on the corresponding held-out data. Cross-validation can also be used as a guard against overfitting when searching for the optimal model during hyperparameter optimization.
In the following snippet, we attempt to improve our decision tree model by tuning several parameters. As we can see, the defaults are pretty good, as we only achieve a marginal improvement.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
.addGrid(dt.maxDepth, [4, 8])
.addGrid(dt.maxBins, [16, 32, 64])
.addGrid(dt.minInstancesPerNode, [1, 2],)
.build())
evaluator = BinaryClassificationEvaluator(labelCol="merged_int",
rawPredictionCol="rawPrediction")
cv = CrossValidator(estimator=dt,
estimatorParamMaps=paramGrid,
evaluator=evaluator, numFolds=3)
cvModel = cv.fit(trainingData)
evaluate(testData, cvModel.transform(testData))
AUC: 0.758221
cvModel.bestModel
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_42a99d738fd535e9c533) of depth 4 with 31 nodes
The AUC metric, while comprehensive, tells us only half the truth about prediction performance. As we can see below, the best algorithm (GBT) consistently mispredicts the merged = False case, even though it does a good job with the True case. This probably happens because the classifier overfits to the majority class (merged = True in our case).
def missprediction_rate(model, testData):
predictions = model.transform(testData)
predictions = predictions.\
select(predictions.merged_int.cast("double").alias('ground_truth'), \
'prediction')
true_predictions = predictions.\
where(predictions.ground_truth == 1.0).\
count()
false_predictions = predictions.\
where(predictions.ground_truth == 0.0).\
count()
true_misspredictions = predictions.\
where(predictions.ground_truth != predictions.prediction).\
where(predictions.ground_truth == 1.0).\
count()
false_misspredictions = predictions.\
where(predictions.ground_truth != predictions.prediction).\
where(predictions.ground_truth == 0.0).\
count()
return ((float(false_misspredictions) / false_predictions) * 100),\
((float(true_misspredictions) / true_predictions) * 100)
print "SVM missprediction rate: False: %f, True: %f" % \
missprediction_rate(lsvcModel, testData)
print "RF missprediction rate: False: %f, True: %f" % \
missprediction_rate(rfModel, testData)
print "GBT missprediction rate: False: %f, True: %f" % \
missprediction_rate(gbtModel, testData)
print "CVModel missprediction rate: False: %f, True: %f" % \
missprediction_rate(cvModel, testData)
SVM missprediction rate: False: 100.000000, True: 0.001285
RF missprediction rate: False: 85.828935, True: 1.278559
GBT missprediction rate: False: 70.999928, True: 3.000437
CVModel missprediction rate: False: 73.689538, True: 2.821824
One solution to this problem is to 'penalize' the classifier during learning by removing data points from the training set. Specifically, we keep all the minority class items and subsample the majority class down to 2x the number of minority class items.
minority_class = allData.where(allData.merged_int == 0)
majority_class = allData.where(allData.merged_int != 0).\
sample(False, 0.5).\
limit(minority_class.count() * 2)
balancedAllData = minority_class.union(majority_class)
(balancedTrainingData, balancedTestData) = \
balancedAllData.randomSplit([0.9, 0.1], seed=42)
Next, we retrain the random forest on our balanced dataset. What we see is that, while the AUC is roughly the same, the misprediction rate has improved vastly for the False case, while becoming only slightly worse for the True case.
balancedRfModel = rf.fit(balancedTrainingData)
evaluate(balancedTestData, balancedRfModel.transform(balancedTestData))
print "Balanced RF missprediction rate: False: %f, True: %f" %\
missprediction_rate(balancedRfModel, balancedTestData)
AUC: 0.783390
Balanced RF missprediction rate: False: 64.615050, True: 6.347009
For completeness, we also train a GBT classifier on the balanced dataset. The results are similar to those of the RF case.
balancedGBTModel = gbt.fit(balancedTrainingData)
evaluate(balancedTestData, balancedGBTModel.transform(balancedTestData))
print "Balanced GBT missprediction rate: False: %f, True: %f" %\
missprediction_rate(balancedGBTModel, balancedTestData)
AUC: 0.795024
Balanced GBT missprediction rate: False: 54.965927, True: 8.678860
The answer to the question of whether to use the default model or the balanced one boils down to priorities: do we care about the False case, even if it is relatively rare?