crawler.py (forked from grantjenks/python-diskcache)
import signal
import urllib.parse
from collections import deque

import bs4
import requests

# Exit cleanly on Ctrl-C instead of dumping a traceback.
signal.signal(signal.SIGINT, lambda signum, frame: exit())

root = 'http://127.0.0.1:8000/'


def get(url):
    "Get url and return response text."
    print(url)
    response = requests.get(url)
    return response.text


def parse(url, text):
    "Parse url with given text and yield links."
    soup = bs4.BeautifulSoup(text, 'lxml')
    for anchor in soup.find_all('a', href=True):
        full_url = urllib.parse.urljoin(url, anchor['href'])
        href, _ = urllib.parse.urldefrag(full_url)
        # Follow only links that stay on the site being crawled.
        if href.startswith(root):
            yield href
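
# For example, while parsing http://127.0.0.1:8000/docs/ an anchor whose href
# is "api.html#install" joins to http://127.0.0.1:8000/docs/api.html#install,
# defrags to http://127.0.0.1:8000/docs/api.html, and passes the
# startswith(root) check; a link to any other host fails the check and is
# skipped. (The /docs/ and api.html names are illustrative only, not pages
# this script assumes exist.)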


def crawl():
    "Crawl root url."
    urls = deque([root])  # frontier of URLs still to visit
    results = dict()      # url -> page text; doubles as the seen set

    # Breadth-first: take URLs from the front, append new links at the back.
    while True:
        try:
            url = urls.popleft()
        except IndexError:
            break
        if url in results:
            continue
        text = get(url)
        for link in parse(url, text):
            urls.append(link)
        results[url] = text

    return results


if __name__ == '__main__':
    crawl()
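
# Usage sketch, assuming a site is served at the root URL above (the server
# command and invocation below are illustrative, not part of this script):
#
#     python -m http.server 8000
#     python crawler.py
#
# Since this file lives in a fork of python-diskcache, the in-memory deque
# and dict in crawl() could plausibly be swapped for diskcache.Deque and
# diskcache.Index to persist the frontier and results across runs; that is
# a suggestion, not something this script does.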