add images & update chapter03
|
|
@ -1,41 +1,109 @@
|
||||||
#Github项目分析一
|
#Github项目分析一
|
||||||
|
|
||||||
#用matplotlib生成图表
|
##用matplotlib生成图表
|
||||||
|
|
||||||
如何分析用户的数据是一个有趣的问题,特别是当我们有大量的数据的时候。
|
如何分析用户的数据是一个有趣的问题,特别是当我们有大量的数据的时候。
|
||||||
除了``matlab``,我们还可以用``numpy``+``matplotlib``
|
除了``matlab``,我们还可以用``numpy``+``matplotlib``
|
||||||
|
|
||||||
##python github用户数据分析##
|
###python github用户数据分析
|
||||||
|
|
||||||
数据可以在这边寻找到
|
数据可以在这边寻找到
|
||||||
|
|
||||||
[https://github.com/gmszone/ml](https://github.com/gmszone/ml)
|
[https://github.com/gmszone/ml](https://github.com/gmszone/ml)
|
||||||
|
|
||||||
最后效果图
|
最后效果图
|
||||||
<img src="https://raw.githubusercontent.com/gmszone/ml/master/screenshots/2014-01-01.png" width=600>
|
|
||||||
|

|
||||||
|
|
||||||
要解析的json文件位于``data/2014-01-01-0.json``,大小6.6M,显然我们可能需要用每次只读一行的策略,这足以解释为什么诸如sublime打开的时候很慢,而现在我们只需要里面的json数据中的创建时间。
|
要解析的json文件位于``data/2014-01-01-0.json``,大小6.6M,显然我们可能需要用每次只读一行的策略,这足以解释为什么诸如sublime打开的时候很慢,而现在我们只需要里面的json数据中的创建时间。
|
||||||
|
|
||||||
==
|
这个文件代表什么?
|
||||||
这个文件代表什么?
|
|
||||||
|
|
||||||
**2014年1月1日零时到一时,用户在github上的操作,这里的用户指的是很多用户。一共有4814条数据,从commit、create到issues都有。**
|
**2014年1月1日零时到一时,用户在github上的操作,这里的用户指的是很多用户。一共有4814条数据,从commit、create到issues都有。**
|
||||||
|
|
||||||
##python json文件解析##
|
###python json文件解析
|
||||||
|
|
||||||
import json
|
```python
|
||||||
for line in open(jsonfile):
|
import json
|
||||||
line = f.readline()
|
for line in open(jsonfile):
|
||||||
|
line = f.readline()
|
||||||
|
```
|
||||||
|
|
||||||
然后再解析json
|
然后再解析json
|
||||||
<pre><code class="python">
|
|
||||||
|
```python
|
||||||
import dateutil.parser
|
import dateutil.parser
|
||||||
|
|
||||||
lin = json.loads(line)
|
lin = json.loads(line)
|
||||||
date = dateutil.parser.parse(lin["created_at"])
|
date = dateutil.parser.parse(lin["created_at"])
|
||||||
</code></pre>
|
```
|
||||||
|
|
||||||
这里用到了``dateutil``,因为新鲜出炉的数据是string,需要用``dateutil``解析为日期对象,再把数据放到数组里头。最后就有了``parse_data``
|
这里用到了``dateutil``,因为新鲜出炉的数据是string,需要用``dateutil``解析为日期对象,再把数据放到数组里头。最后就有了``parse_data``
|
||||||
|
|
||||||
|
```python
|
||||||
|
def parse_data(jsonfile):
|
||||||
|
f = open(jsonfile, "r")
|
||||||
|
dataarray = []
|
||||||
|
datacount = 0
|
||||||
|
|
||||||
|
for line in open(jsonfile):
|
||||||
|
line = f.readline()
|
||||||
|
lin = json.loads(line)
|
||||||
|
date = dateutil.parser.parse(lin["created_at"])
|
||||||
|
datacount += 1
|
||||||
|
dataarray.append(date.minute)
|
||||||
|
|
||||||
|
minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
|
||||||
|
f.close()
|
||||||
|
return minuteswithcount
|
||||||
|
```
|
||||||
|
|
||||||
|
下面这句代码就是将上面的解析为
|
||||||
|
|
||||||
|
```python
|
||||||
|
minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
|
||||||
|
```
|
||||||
|
|
||||||
|
这样的数组以便于解析
|
||||||
|
|
||||||
|
```python
|
||||||
|
[(0, 92), (1, 67), (2, 86), (3, 73), (4, 76), (5, 67), (6, 61), (7, 71), (8, 62), (9, 71), (10, 70), (11, 79), (12, 62), (13, 67), (14, 76), (15, 67), (16, 74), (17, 48), (18, 78), (19, 73), (20, 89), (21, 62), (22, 74), (23, 61), (24, 71), (25, 49), (26, 59), (27, 59), (28, 58), (29, 74), (30, 69), (31, 59), (32, 89), (33, 67), (34, 66), (35, 77), (36, 64), (37, 71), (38, 75), (39, 66), (40, 62), (41, 77), (42, 82), (43, 95), (44, 77), (45, 65), (46, 59), (47, 60), (48, 54), (49, 66), (50, 74), (51, 61), (52, 71), (53, 90), (54, 64), (55, 67), (56, 67), (57, 55), (58, 68), (59, 91)]
|
||||||
|
```
|
||||||
|
|
||||||
|
##matplotlib
|
||||||
|
|
||||||
|
开始之前需要安装``matplotlib``
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo pip install matplotlib
|
||||||
|
```
|
||||||
|
然后引入这个库
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
如上面的那个结果,只需要
|
||||||
|
|
||||||
|
<pre><code class="python">
|
||||||
|
plt.figure(figsize=(8,4))
|
||||||
|
plt.plot(x, y,label = files)
|
||||||
|
plt.legend()
|
||||||
|
plt.show()
|
||||||
|
</code></pre>
|
||||||
|
|
||||||
|
最后代码可见
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import json
|
||||||
|
import dateutil.parser
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.mlab as mlab
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
def parse_data(jsonfile):
|
def parse_data(jsonfile):
|
||||||
f = open(jsonfile, "r")
|
f = open(jsonfile, "r")
|
||||||
dataarray = []
|
dataarray = []
|
||||||
|
|
@ -53,83 +121,27 @@ def parse_data(jsonfile):
|
||||||
return minuteswithcount
|
return minuteswithcount
|
||||||
|
|
||||||
|
|
||||||
下面这句代码就是将上面的解析为
|
def draw_date(files):
|
||||||
|
x = []
|
||||||
|
y = []
|
||||||
|
mwcs = parse_data(files)
|
||||||
|
for mwc in mwcs:
|
||||||
|
x.append(mwc[0])
|
||||||
|
y.append(mwc[1])
|
||||||
|
|
||||||
minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
|
|
||||||
|
|
||||||
这样的数组以便于解析
|
|
||||||
|
|
||||||
[(0, 92), (1, 67), (2, 86), (3, 73), (4, 76), (5, 67), (6, 61), (7, 71), (8, 62), (9, 71), (10, 70), (11, 79), (12, 62), (13, 67), (14, 76), (15, 67), (16, 74), (17, 48), (18, 78), (19, 73), (20, 89), (21, 62), (22, 74), (23, 61), (24, 71), (25, 49), (26, 59), (27, 59), (28, 58), (29, 74), (30, 69), (31, 59), (32, 89), (33, 67), (34, 66), (35, 77), (36, 64), (37, 71), (38, 75), (39, 66), (40, 62), (41, 77), (42, 82), (43, 95), (44, 77), (45, 65), (46, 59), (47, 60), (48, 54), (49, 66), (50, 74), (51, 61), (52, 71), (53, 90), (54, 64), (55, 67), (56, 67), (57, 55), (58, 68), (59, 91)]
|
|
||||||
|
|
||||||
##matplotlib##
|
|
||||||
开始之前需要安装``matplotlib``
|
|
||||||
|
|
||||||
sudo pip install matplotlib
|
|
||||||
|
|
||||||
然后引入这个库
|
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
|
|
||||||
如上面的那个结果,只需要
|
|
||||||
|
|
||||||
<pre><code class="python">
|
|
||||||
plt.figure(figsize=(8,4))
|
plt.figure(figsize=(8,4))
|
||||||
plt.plot(x, y,label = files)
|
plt.plot(x, y,label = files)
|
||||||
plt.legend()
|
plt.legend()
|
||||||
plt.show()
|
plt.show()
|
||||||
</code></pre>
|
|
||||||
|
|
||||||
最后代码可见
|
draw_date("data/2014-01-01-0.json")
|
||||||
|
```
|
||||||
|
|
||||||
#!/usr/bin/env python
|
##每周分析
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
import json
|
|
||||||
import dateutil.parser
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.mlab as mlab
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
|
|
||||||
|
|
||||||
def parse_data(jsonfile):
|
|
||||||
f = open(jsonfile, "r")
|
|
||||||
dataarray = []
|
|
||||||
datacount = 0
|
|
||||||
|
|
||||||
for line in open(jsonfile):
|
|
||||||
line = f.readline()
|
|
||||||
lin = json.loads(line)
|
|
||||||
date = dateutil.parser.parse(lin["created_at"])
|
|
||||||
datacount += 1
|
|
||||||
dataarray.append(date.minute)
|
|
||||||
|
|
||||||
minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
|
|
||||||
f.close()
|
|
||||||
return minuteswithcount
|
|
||||||
|
|
||||||
|
|
||||||
def draw_date(files):
|
|
||||||
x = []
|
|
||||||
y = []
|
|
||||||
mwcs = parse_data(files)
|
|
||||||
for mwc in mwcs:
|
|
||||||
x.append(mwc[0])
|
|
||||||
y.append(mwc[1])
|
|
||||||
|
|
||||||
plt.figure(figsize=(8,4))
|
|
||||||
plt.plot(x, y,label = files)
|
|
||||||
plt.legend()
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
draw_date("data/2014-01-01-0.json")
|
|
||||||
|
|
||||||
|
|
||||||
#每周分析
|
|
||||||
|
|
||||||
继上篇之后,我们就可以分析用户的每周提交情况,以得出用户的真正的工作效率,每个程序员的工作时间可能是不一样的,如
|
继上篇之后,我们就可以分析用户的每周提交情况,以得出用户的真正的工作效率,每个程序员的工作时间可能是不一样的,如
|
||||||
![Phodal Huang's Report][1]
|
|
||||||
|
|
||||||
[1]: https://www.phodal.com/static/media/uploads/screen_shot_2014-04-12_at_9.58.52_am.png
|

|
||||||
|
|
||||||
这是我的每周情况,显然如果把星期六移到前面的话,随着工作时间的增长,在github上的使用在下降,作为一个
|
这是我的每周情况,显然如果把星期六移到前面的话,随着工作时间的增长,在github上的使用在下降,作为一个
|
||||||
|
|
||||||
|
|
@ -137,11 +149,11 @@ def parse_data(jsonfile):
|
||||||
|
|
||||||
不过这个是osrc的分析结果。
|
不过这个是osrc的分析结果。
|
||||||
|
|
||||||
##python github 每周情况分析##
|
###python github 每周情况分析
|
||||||
|
|
||||||
看一张分析后的结果
|
看一张分析后的结果
|
||||||
|
|
||||||
<img src="https://raw.githubusercontent.com/gmszone/ml/master/screenshots/feb-results.png" width=600>
|

|
||||||
|
|
||||||
结果正好与我的情况相反?似乎图上是这么说的,但是数据上是这样的情况。
|
结果正好与我的情况相反?似乎图上是这么说的,但是数据上是这样的情况。
|
||||||
|
|
||||||
|
|
@ -174,67 +186,71 @@ def parse_data(jsonfile):
|
||||||
8474, 7984, 12933, 13504, 13763, 13544, 12940,
|
8474, 7984, 12933, 13504, 13763, 13544, 12940,
|
||||||
7119, 7346, 13412, 14008, 12555
|
7119, 7346, 13412, 14008, 12555
|
||||||
|
|
||||||
##python 数据分析##
|
###python 数据分析
|
||||||
|
|
||||||
重写了一个新的方法用于计算提交数,直至后面才意识到其实我们可以算行数就够了,但是方法上有点hack
|
重写了一个新的方法用于计算提交数,直至后面才意识到其实我们可以算行数就够了,但是方法上有点hack
|
||||||
|
|
||||||
<pre><code class="python">
|
```python
|
||||||
def get_minutes_counts_with_id(jsonfile):
|
def get_minutes_counts_with_id(jsonfile):
|
||||||
datacount, dataarray = handle_json(jsonfile)
|
datacount, dataarray = handle_json(jsonfile)
|
||||||
minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
|
minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
|
||||||
return minuteswithcount
|
return minuteswithcount
|
||||||
|
|
||||||
|
|
||||||
def handle_json(jsonfile):
|
def handle_json(jsonfile):
|
||||||
f = open(jsonfile, "r")
|
f = open(jsonfile, "r")
|
||||||
dataarray = []
|
dataarray = []
|
||||||
datacount = 0
|
datacount = 0
|
||||||
|
|
||||||
for line in open(jsonfile):
|
for line in open(jsonfile):
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
lin = json.loads(line)
|
lin = json.loads(line)
|
||||||
date = dateutil.parser.parse(lin["created_at"])
|
date = dateutil.parser.parse(lin["created_at"])
|
||||||
datacount += 1
|
datacount += 1
|
||||||
dataarray.append(date.minute)
|
dataarray.append(date.minute)
|
||||||
|
|
||||||
f.close()
|
f.close()
|
||||||
return datacount, dataarray
|
return datacount, dataarray
|
||||||
|
|
||||||
|
|
||||||
def get_minutes_count_num(jsonfile):
|
def get_minutes_count_num(jsonfile):
|
||||||
datacount, dataarray = handle_json(jsonfile)
|
datacount, dataarray = handle_json(jsonfile)
|
||||||
return datacount
|
return datacount
|
||||||
|
|
||||||
|
|
||||||
def get_month_total():
|
def get_month_total():
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
:rtype : object
|
||||||
|
"""
|
||||||
|
monthdaycount = []
|
||||||
|
for i in range(1, 20):
|
||||||
|
if i < 10:
|
||||||
|
filename = 'data/2014-02-0' + i.__str__() + '-0.json'
|
||||||
|
else:
|
||||||
|
filename = 'data/2014-02-' + i.__str__() + '-0.json'
|
||||||
|
monthdaycount.append(get_minutes_count_num(filename))
|
||||||
|
return monthdaycount
|
||||||
|
```
|
||||||
|
|
||||||
:rtype : object
|
|
||||||
"""
|
|
||||||
monthdaycount = []
|
|
||||||
for i in range(1, 20):
|
|
||||||
if i < 10:
|
|
||||||
filename = 'data/2014-02-0' + i.__str__() + '-0.json'
|
|
||||||
else:
|
|
||||||
filename = 'data/2014-02-' + i.__str__() + '-0.json'
|
|
||||||
monthdaycount.append(get_minutes_count_num(filename))
|
|
||||||
return monthdaycount
|
|
||||||
</code></pre>
|
|
||||||
接着我们需要去遍历每个结果,后面的后面会发现这个效率真的是太低了,为什么木有多线程?
|
接着我们需要去遍历每个结果,后面的后面会发现这个效率真的是太低了,为什么木有多线程?
|
||||||
|
|
||||||
##python matplotlib图表##
|
###python matplotlib图表
|
||||||
|
|
||||||
让我们的matplotlib来做这些图表的工作
|
让我们的matplotlib来做这些图表的工作
|
||||||
|
|
||||||
if __name__ == '__main__':
|
```python
|
||||||
results = pd.get_month_total()
|
if __name__ == '__main__':
|
||||||
print results
|
results = pd.get_month_total()
|
||||||
|
print results
|
||||||
|
|
||||||
plt.figure(figsize=(8, 4))
|
plt.figure(figsize=(8, 4))
|
||||||
plt.plot(results.__getslice__(0, 7), label="first week")
|
plt.plot(results.__getslice__(0, 7), label="first week")
|
||||||
plt.plot(results.__getslice__(7, 14), label="second week")
|
plt.plot(results.__getslice__(7, 14), label="second week")
|
||||||
plt.plot(results.__getslice__(14, 21), label="third week")
|
plt.plot(results.__getslice__(14, 21), label="third week")
|
||||||
plt.legend()
|
plt.legend()
|
||||||
plt.show()
|
plt.show()
|
||||||
|
```
|
||||||
|
|
||||||
蓝色的是第一周,绿色的是第二周,红色的是第三周,就有了上面的结果。
|
蓝色的是第一周,绿色的是第二周,红色的是第三周,就有了上面的结果。
|
||||||
|
|
||||||
|
|
|
||||||
BIN
img/2014-01-01.png
Normal file
|
After Width: | Height: | Size: 36 KiB |
BIN
img/echoesworks.png
Normal file
|
After Width: | Height: | Size: 18 KiB |
BIN
img/elasticsearch_ionit_map.jpg
Normal file
|
After Width: | Height: | Size: 62 KiB |
BIN
img/feb-results.png
Normal file
|
After Width: | Height: | Size: 40 KiB |
BIN
img/github-200-days.png
Normal file
|
After Width: | Height: | Size: 229 KiB |
BIN
img/github-365.jpg
Normal file
|
After Width: | Height: | Size: 120 KiB |
BIN
img/lan-iot.jpg
Normal file
|
After Width: | Height: | Size: 66 KiB |
BIN
img/lan.png
Normal file
|
After Width: | Height: | Size: 14 KiB |
BIN
img/nginx_pig.jpg
Normal file
|
After Width: | Height: | Size: 29 KiB |
BIN
img/phodal-results.png
Normal file
|
After Width: | Height: | Size: 8.6 KiB |
BIN
img/repo-status.png
Normal file
|
After Width: | Height: | Size: 25 KiB |
BIN
img/resume.png
Normal file
|
After Width: | Height: | Size: 200 KiB |
BIN
img/screen_shot_2015-05-09_at_23.23.31.png
Normal file
|
After Width: | Height: | Size: 128 KiB |
BIN
img/screenshot.png
Normal file
|
After Width: | Height: | Size: 243 KiB |
BIN
img/skilltree.jpg
Normal file
|
After Width: | Height: | Size: 102 KiB |