forked from shibing624/github-hot
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler.py
More file actions
101 lines (80 loc) · 3.32 KB
/
crawler.py
File metadata and controls
101 lines (80 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: Scrape GitHub Trending pages and save the top repositories to a dated Markdown file.
"""
import datetime
import os
from codecs import open
import pandas as pd
import requests
from pyquery import PyQuery
def git_add_commit_push(date, filename):
    """Stage ``filename``, commit it with ``date`` as the message, then push.

    The three git commands are executed without a shell and receive their
    arguments as lists, so a filename or date containing spaces, quotes or
    shell metacharacters reaches git verbatim instead of being re-parsed.

    Args:
        date: Text used as the commit message.
        filename: Path of the file to stage.

    The function is best-effort: a git command that exits non-zero does not
    raise and the remaining commands still run. If the ``git`` executable
    cannot be started at all, a message is printed and the function returns.
    """
    # Local import keeps this block self-contained; the module's import list
    # does not include subprocess.
    import subprocess

    commands = (
        # ``--`` stops option parsing so a path starting with ``-`` is still
        # treated as a path.
        ['git', 'add', '--', str(filename)],
        ['git', 'commit', '-m', str(date)],
        ['git', 'push', '-u', 'origin', 'master'],
    )
    for command in commands:
        try:
            subprocess.run(command, check=False)
        except OSError as err:
            # Raised when git itself cannot be launched; every later command
            # would fail the same way, so stop here.
            print('cannot run {cmd}: {err}'.format(cmd=' '.join(command), err=err))
            return
def create_markdown(date, filename):
    """Create (or overwrite) ``filename`` with the day's Markdown heading.

    Args:
        date: Date text placed in the heading line.
        filename: Path of the Markdown file to write. Missing parent
            directories are created first so a fresh checkout without the
            output folder does not fail.
    """
    parent = os.path.dirname(filename)
    # A bare file name has an empty dirname, and os.makedirs('') would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("## " + date + " Github Trending\n")
def scrape(language, filename, topk=5, timeout=30):
    """Fetch a GitHub Trending page and append its repositories to ``filename``.

    The page ``https://github.com/trending/<language>`` is downloaded, every
    ``article.Box-row`` entry is parsed, and the parsed rows are handed to
    ``save_to_md`` together with ``filename``, ``language`` and ``topk``.

    Args:
        language: Language slug appended to the trending URL. ``job`` passes
            an empty string for the overall trending page.
        filename: Markdown file that ``save_to_md`` appends to.
        topk: Maximum number of repositories ``save_to_md`` keeps.
        timeout: Seconds to wait for the HTTP request; without a timeout a
            stalled connection would block forever.

    Raises:
        requests.HTTPError: If the response status code is not 200.
        requests.RequestException: Propagated from ``requests.get`` on
            connection problems or timeouts.
    """
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    page_url = 'https://github.com/trending/{language}'.format(language=language)
    r = requests.get(page_url, headers=HEADERS, timeout=timeout)
    # Explicit check instead of ``assert``: assertions vanish under ``-O``.
    if r.status_code != 200:
        raise requests.HTTPError(
            'unexpected status {code} for {url}'.format(code=r.status_code, url=page_url),
            response=r)

    d = PyQuery(r.content)
    ds = []
    for item in d('div.Box article.Box-row'):
        try:
            ds.append(_parse_trending_row(item))
        except (IndexError, ValueError) as err:
            # One malformed entry should not discard the rest of the page.
            print('skip unparsable trending row: {err}'.format(err=err))
    save_to_md(ds, filename, language, topk)


def _parse_count(text):
    """Convert a counter such as ``'1,234'`` to an int."""
    return int(text.replace(',', ''))


def _parse_trending_row(item):
    """Return ``[title, url, description, star, fork, new_star]`` for one row.

    Raises:
        ValueError: If the repository link has no ``href``, the counter links
            do not yield exactly two tokens, or a counter is not numeric.
        IndexError: If the star-icon text has fewer than two tokens.
    """
    row = PyQuery(item)
    link = row(".lh-condensed a")
    href = link.attr("href")
    if not href:
        raise ValueError('repository link has no href')

    # Text of the links in the footer line; presumably the total star count
    # followed by the fork count -- TODO confirm against current markup.
    star, fork = row(".f6 a").text().strip().split()
    # The second token of the text next to the star icons is used as the
    # number of stars gained today.
    new_star = row(".f6 svg.octicon-star").parent().text().strip().split()[1]

    return [
        link.text(),
        "https://github.com" + href,
        row("p.col-9").text(),
        _parse_count(star),
        _parse_count(fork),
        _parse_count(new_star),
    ]
def save_to_md(ds, filename, language, topk=5):
    """Append a ``### <language>`` section listing the top repositories.

    Args:
        ds: Rows of ``[title, url, description, star, fork, new_star]``.
        filename: Markdown file opened in append mode with UTF-8 encoding.
        language: Text used for the section heading.
        topk: Number of rows kept after ranking.

    Rows are ranked in descending order by ``new_star``, then ``star``, then
    ``fork``; only the first ``topk`` are written, one bullet per repository.
    """
    ranked = (
        pd.DataFrame(ds, columns=['title', 'url', 'description', 'star', 'fork', 'new_star'])
        .sort_values(by=['new_star', 'star', 'fork'], ascending=False)
        .reset_index(drop=True)
        .head(topk)
    )

    bullet = "* [{title}]({url}): {description} ***Star:{stars} Fork:{fork} Today stars:{new_star}***\n"
    lines = ['\n### {language}\n'.format(language=language)]
    for repo in ranked.itertuples(index=False):
        lines.append(bullet.format(
            title=repo.title,
            url=repo.url,
            description=repo.description,
            stars=repo.star,
            fork=repo.fork,
            new_star=repo.new_star,
        ))

    with open(filename, "a", encoding="utf-8") as f:
        f.writelines(lines)
def job():
    """Build today's trending digest under ``markdowns/``.

    The file is named after the current local date. It is first (re)created
    with a heading, then filled with the overall trending section (10
    entries) followed by one section per language using ``scrape``'s default
    entry count.
    """
    day = datetime.date.today().isoformat()
    target = 'markdowns/{date}.md'.format(date=day)

    # Start from a fresh file containing only the heading.
    create_markdown(day, target)

    # An empty language selects the overall page
    # (https://github.com/trending?since=daily).
    scrape('', target, topk=10)
    for lang in ('python', 'java', 'javascript', 'go'):
        scrape(lang, target)

    print('save markdown file to {filename}'.format(filename=target))
    # Publishing via git_add_commit_push is not performed here.
if __name__ == "__main__":
    # Run a single scraping pass when the file is executed as a script.
    job()