forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgit_code_churn.py
119 lines (79 loc) · 2.81 KB
/
git_code_churn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from dateutil import parser
import subprocess
import os
import re
import sys
import numpy as np
from pandas import *
# Location of the git checkout whose history is dumped by ``githist``.
repo_path = '/home/wesm/code/pandas'

# Shell command line: one log line per commit ("<abbrev sha> <short date> |
# <subject><ref names> [<author>]"), redirected into githist.txt in the
# current working directory.
githist = (
    'git log --pretty=format:"%h %ad | %s%d [%an]" --date=short {0}'
    ' > githist.txt'
).format(repo_path)
def rungithist():
    """Execute the module-level ``githist`` command line through the shell.

    The command string itself contains the ``> githist.txt`` redirection, so
    the log ends up in a file relative to the current working directory.
    The exit status reported by ``os.system`` is discarded.
    """
    command = githist
    os.system(command)
def get_commit_history():
    """Dump the git log via ``rungithist`` and parse it.

    ``rungithist`` writes one line per commit to ``githist.txt``; that file
    is read and then removed again. The first two whitespace-separated
    fields of every line are taken as the commit sha and the commit date.

    Returns
    -------
    commits : Series
        Dates (parsed with ``dateutil.parser.parse``) indexed by sha, in the
        order the lines appear in the log.
    hists : dict
        Maps each sha to its complete log line.
    """
    rungithist()

    # Use a context manager so the handle is closed before the file is
    # removed (the bare ``open(...).read()`` left that to the garbage
    # collector).
    with open('githist.txt') as log_file:
        log_text = log_file.read()
    os.remove('githist.txt')

    # Keep only lines that carry both a sha and a date; a blank line or an
    # empty log would otherwise break the sha/date unpacking.
    records = []
    for line in log_text.split('\n'):
        fields = line.split()
        if len(fields) < 2:
            continue
        records.append((fields[0], fields[1], line))

    shas = [sha for sha, _, _ in records]
    dates = [parser.parse(date) for _, date, _ in records]
    hists = dict((sha, line) for sha, _, line in records)

    return Series(dates, shas), hists
def get_commit_churn(sha, prev_sha):
    """Return per-file line counts from ``git diff <sha> <prev_sha> --numstat``.

    git is started in the current working directory of the process.

    Parameters
    ----------
    sha, prev_sha
        Revision arguments handed to ``git diff`` in exactly this order.
        NOTE(review): ``git diff A B`` describes the change going from A to
        B -- confirm that this direction is the one callers expect.

    Returns
    -------
    insertions : dict
        Maps each reported path to the integer in the first numstat column.
    deletions : dict
        Maps each reported path to the integer in the second numstat column.

    Output lines that do not consist of exactly three tab-separated fields
    with integer counts (for example binary files, which numstat reports
    with ``-`` instead of numbers) are skipped.
    """
    proc = subprocess.Popen(['git', 'diff', sha, prev_sha, '--numstat'],
                            stdout=subprocess.PIPE)
    # communicate() drains the pipe and waits for git to exit, so no
    # unreaped child process or open pipe is left behind.
    output, _ = proc.communicate()

    # On Python 3 the pipe yields bytes, which cannot be split on a str
    # separator; decode once here. On Python 2 the output is already str.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')

    insertions = {}
    deletions = {}
    for line in output.split('\n'):
        try:
            added, removed, path = line.split('\t')
            added = int(added)
            removed = int(removed)
        except ValueError:
            # Wrong field count or non-numeric counts: not a countable entry.
            continue
        insertions[path] = added
        deletions[path] = removed

    return insertions, deletions
def get_code_churn(commits):
    """Collect per-file churn between consecutive commits into a Panel.

    Parameters
    ----------
    commits : Series
        Only ``commits.index`` is used; it is walked in reverse order.
        Presumably this is the sha-indexed Series returned by
        ``get_commit_history`` -- confirm against the caller.

    Returns
    -------
    Panel
        Items ``'insertions'`` and ``'deletions'``, each a DataFrame built
        from ``{sha: {path: count}}``, with ``minor_axis`` set to the
        reversed shas. The first reversed sha has no predecessor to diff
        against, so no counts are computed for it.
    """
    shas = commits.index[::-1]

    insertions = {}
    deletions = {}

    prev = shas[0]
    for cur in shas[1:]:
        insertions[cur], deletions[cur] = get_commit_churn(cur, prev)
        prev = cur

    return Panel({'insertions': DataFrame(insertions),
                  'deletions': DataFrame(deletions)}, minor_axis=shas)
if __name__ == '__main__':
    # Script entry point: churn is taken from vbench's GitRepo rather than
    # from get_commit_history() / get_code_churn() defined in this file.
    from vbench.git import GitRepo

    repo = GitRepo('/Users/wesm/code/pandas')
    churn = repo.get_churn_by_file()

    # Restrict the major axis to Python and Cython source paths.
    file_include = [path for path in churn.major_axis
                    if path.endswith(('.pyx', '.py'))]

    # Skip every commit whose message contains 'LF', then drop one
    # hard-coded sha (remove() raises ValueError if it is not present).
    commits_include = []
    for sha in churn.minor_axis:
        if 'LF' in repo.messages[sha]:
            continue
        commits_include.append(sha)
    commits_include.remove('dcf3490')

    clean_churn = churn.reindex(major=file_include, minor=commits_include)

    # Collapse to one total per commit, then to one total per commit date.
    by_commit = clean_churn.sum('major').sum(1)
    by_date = by_commit.groupby(repo.commit_date).sum()
    by_date = by_date.drop([datetime(2011, 6, 10)])

    # Keep only days with a total below 5000 (original note: clean out
    # days where I touched Cython).
    by_date = by_date[by_date < 5000]