forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgit_code_churn.py
115 lines (76 loc) · 2.67 KB
/
git_code_churn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from dateutil import parser
import subprocess
import os
import re
import sys
import numpy as np
from pandas import *
# Checkout whose history is analysed.
repo_path = '/home/wesm/code/pandas'

# Shell command line that writes one log line per commit (abbreviated sha,
# short date, subject, ref names, author) into githist.txt in the current
# working directory.
githist = 'git log --pretty=format:"%h %ad | %s%d [%an]" --date=short '
githist += repo_path + ' > githist.txt'
def rungithist():
    """Run the module-level ``githist`` shell command.

    The command redirects ``git log`` output into ``githist.txt`` in the
    current working directory.

    Raises
    ------
    RuntimeError
        If the shell reports a non-zero status, so that a failed ``git log``
        is not later mistaken for an empty history.
    """
    status = os.system(githist)
    if status != 0:
        raise RuntimeError('command failed with status %s: %s'
                           % (status, githist))
def get_commit_history():
    """Dump the git log to a scratch file and parse it.

    Calls ``rungithist()`` to write ``githist.txt``, reads the file and then
    deletes it. Every usable line is expected to start with two
    whitespace-separated fields: the sha and the commit date.

    Returns
    -------
    commits : Series
        Parsed dates indexed by sha, in the order the log lists them.
    hists : dict
        Maps each sha to its complete log line.
    """
    rungithist()
    # Close the handle explicitly before removing the scratch file.
    with open('githist.txt') as handle:
        log_text = handle.read()
    os.remove('githist.txt')

    shas = []
    dates = []
    hists = {}
    for line in log_text.split('\n'):
        fields = line.split()
        if len(fields) < 2:
            # Blank or truncated line: there is no sha/date pair to record.
            continue
        sha, date = fields[:2]
        shas.append(sha)
        dates.append(parser.parse(date))
        hists[sha] = line
    return Series(dates, shas), hists
def get_commit_churn(sha, prev_sha):
    """Return per-file line counts from ``git diff sha prev_sha --numstat``.

    Parameters
    ----------
    sha, prev_sha : str
        Revisions handed to ``git diff``, in that order. The command runs in
        the current working directory.

    Returns
    -------
    insertions : dict
        Maps file path to the added-line count reported by git.
    deletions : dict
        Maps file path to the removed-line count reported by git.

    Notes
    -----
    Output lines that are not three tab-separated fields with integer counts
    are skipped; that includes binary files, for which git prints ``-``
    in place of the counts.
    """
    # NOTE(review): with ``sha`` listed first, git reports the change going
    # from ``sha`` to ``prev_sha`` -- confirm this direction is intended.
    proc = subprocess.Popen(['git', 'diff', sha, prev_sha, '--numstat'],
                            stdout=subprocess.PIPE,
                            universal_newlines=True)
    # communicate() drains the pipe, closes it and waits for the child.
    output, _ = proc.communicate()

    insertions = {}
    deletions = {}
    for line in output.split('\n'):
        fields = line.split('\t')
        if len(fields) != 3:
            continue
        added, removed, path = fields
        try:
            added, removed = int(added), int(removed)
        except ValueError:
            # Non-numeric counts: skip the file in both mappings.
            continue
        insertions[path] = added
        deletions[path] = removed
    return insertions, deletions
def get_code_churn(commits):
    """Collect per-file churn between each pair of consecutive commits.

    Parameters
    ----------
    commits : object with an ``index`` of shas
        In this script, the Series returned by ``get_commit_history()``.
        The index is walked in reverse order.

    Returns
    -------
    Panel
        Items ``'insertions'`` and ``'deletions'``, each a DataFrame built
        from a ``{sha: {path: count}}`` mapping; the minor axis is the
        reversed sha index.
    """
    shas = commits.index[::-1]
    insertions = {}
    deletions = {}
    # The first sha has nothing before it, so it gets no entry of its own;
    # pairing via zip also copes with an empty index.
    for prev, cur in zip(shas[:-1], shas[1:]):
        insertions[cur], deletions[cur] = get_commit_churn(cur, prev)
    return Panel({'insertions': DataFrame(insertions),
                  'deletions': DataFrame(deletions)}, minor_axis=shas)
if __name__ == '__main__':
    commits, hists = get_commit_history()
    churn = get_code_churn(commits)

    # Restrict the analysis to .py and .pyx files.
    file_include = [path for path in churn.major_axis
                    if path.endswith(('.pyx', '.py'))]

    # Leave out commits whose log line contains 'LF', plus one hard-coded
    # sha.
    commits_include = []
    for sha in churn.minor_axis:
        if 'LF' in hists[sha]:
            continue
        commits_include.append(sha)
    commits_include.remove('dcf3490')

    clean_churn = churn.reindex(major=file_include, minor=commits_include)
    by_commit = clean_churn.sum('major').sum(1)

    # Total the per-commit figures by commit date, then drop one
    # hard-coded day.
    by_date = by_commit.groupby(commits).sum()
    by_date = by_date.drop([datetime(2011, 6, 10)])

    # clean out days where I touched Cython
    by_date = by_date[by_date < 5000]