import asyncio
import re
import aiohttp

from datetime import datetime
from lxml import etree


class DomainExpiryDateGetter:
    def __init__(self, domain):
        self.result = None
        self.q = asyncio.Queue()
        # Register the sites to crawl and the parser callback for each of them
        self.callback_dict = {
            f'http://whois.chinaz.com/{domain}': self.parse_chinaz,
            f'http://whois.xinnet.com/domains_srv/{domain}': self.parse_xinnet,
            # f'http://whois.webmasterhome.cn/?domain={domain}': self.parse_webmasterhome,
        }

        self.headers = {
            'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/70.0.3538.77 Safari/537.36'
        }

    async def request(self, url):
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url=url, headers=self.headers) as resp:
                    if resp.status == 200:
                        return await resp.text()
                    else:
                        print(f'--- request status error: {url} ---')
            except aiohttp.ClientConnectionError:
                print(f'--- request connection error: {url} ---')

    def parse_chinaz(self, resp):
        ele = etree.HTML(resp).xpath('//a[@id="update_a2"]/parent::*/span')
        if ele:
            expirydate = ele[0].text
            return self.to_timestamp(datetime.strptime(expirydate,
                                                       "%Y年%m月%d日"))

    def parse_xinnet(self, resp):
        ret = re.search(r'Registry Expiry Date: (.*?)<br/>', resp)
        if ret:
            expirydate = ret.group(1)
            return self.to_timestamp(
                datetime.strptime(expirydate, "%Y-%m-%dT%H:%M:%SZ"))

    def parse_webmasterhome(self, resp):
        ...

    def to_timestamp(self, t):
        return int(t.timestamp())

    async def crawl_domain(self, url, parser):
        """Fetch one URL and parse the expiry date out of the response."""
        resp = await self.request(url)
        if resp:
            expirydate = parser(resp)
            return expirydate

    async def handle_tasks(self, task_id):
        """Consume the crawl queue until it is empty or a result is found."""
        while not self.q.empty():
            url, parser = await self.q.get()
            print(
                f'======= task No.{task_id + 1} started, requesting: {url} ======='
            )
            # Expiry timestamp of this domain
            response = await self.crawl_domain(url, parser)
            if response:
                self.result = response
                break

    def run(self, max_threads=3):
        for url, parser in self.callback_dict.items():
            # Enqueue each URL together with its parser callback
            self.q.put_nowait((url, parser))

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        tasks = [self.handle_tasks(task_id) for task_id in range(max_threads)]
        loop.run_until_complete(asyncio.gather(*tasks))


def domain_spider(domain, max_threads=3):
    d = DomainExpiryDateGetter(domain)
    d.run(max_threads)
    return d.result
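

# A minimal usage sketch (assumption: this module is run directly; the domain
# name below is illustrative and not part of the original code).
if __name__ == '__main__':
    print(domain_spider('example.com'))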
import re
import logging
import asyncio

import aiohttp
from pyquery import PyQuery as pq


class NhentaiSpider:
    def __init__(self, language, max_pages, max_threads):
        self.language = language
        self.max_pages = max_pages
        self.max_threads = max_threads

        self.results = []
        self.q = asyncio.Queue()

        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
        self.detail_url_pattern = re.compile(r'galleries\/(\d+)\/(\d+)t', re.S)
        self.language_pattern_dict = {
            'All': r'^.*$',
            'Chinese': r'^.*([Cc]hinese|汉化|漢化).*$',
            'English': r'^.*([Ee]nglish).*$',
            'Japan': r'^((?<![Cc]hinese|[Ee]nglish).)*$'
        }

    async def __request_dom(self, url):
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url=url, headers=self.headers) as resp:
                    if resp.status == 200:
                        return await resp.text()
                    else:
                        print(f'request status error: {url}')
            except aiohttp.ClientConnectionError:
                print(f'request connection error: {url}')

    def __parse_index(self, index_html):
        # Keep only galleries whose caption matches the selected language pattern
        pattern = re.compile(self.language_pattern_dict[self.language], re.S)
        div_tags = pq(index_html)('.container.index-container .gallery').items()
        for div_tag in div_tags:
            try:
                title = pattern.search(div_tag('.caption').text()).group()
            except AttributeError:
                pass
            else:
                x = {
                    'title': title,
                    'commic_url': 'https://nhentai.net' + div_tag('a[class="cover"]').attr('href'),
                    'face_img_url': div_tag('noscript img').attr('src'),
                    'pages_url': [],
                }
                self.results.append(x)

    def __parse_detail(self, detail_html):
        # Collect the full-size image URL for every page thumbnail
        div_tags = pq(detail_html)('.thumb-container a img').items()
        commic_pages_list = []
        for div_tag in div_tags:
            png_url = div_tag.attr('data-src')
            if png_url:
                result_url = self.detail_url_pattern.search(png_url).groups()

                img_url = 'https://i.nhentai.net/galleries/{}/{}.jpg'.format(*result_url)
                commic_pages_list.append(img_url)
        return commic_pages_list

    async def get_results(self, url):
        '''Crawl a single index URL, then fill in the page URLs of each comic.'''
        html = await self.__request_dom(url)
        if html:
            self.__parse_index(html)

        for each_commic in self.results:
            each_commic_url = each_commic['commic_url']
            detail_html = await self.__request_dom(each_commic_url)
            commic_pages_list = self.__parse_detail(detail_html)
            each_commic['pages_url'] = commic_pages_list

    async def handle_tasks(self, task_id):
        while not self.q.empty():
            current_url = await self.q.get()
            try:
                await self.get_results(current_url)
            except Exception:
                logging.exception('Error for {}'.format(current_url))

    def run(self):
        for page in range(1, self.max_pages + 1):
            url = f'https://nhentai.net/?page={page}'
            self.q.put_nowait(url)

        loop = asyncio.get_event_loop()
        tasks = [self.handle_tasks(task_id) for task_id in range(self.max_threads)]
        loop.run_until_complete(asyncio.gather(*tasks))


def main():
    # Language of comics to scrape (other languages are discarded)
    # One of: All / Chinese / Japan / English
    language = 'Chinese'
    max_page = 5
    max_threads = 2

    nhentaispider = NhentaiSpider(language, max_page, max_threads)
    nhentaispider.run()
    print(nhentaispider.results)


if __name__ == '__main__':
    main()
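

# A quick sketch (not part of the original spider) of how the regexes in
# language_pattern_dict behave; the sample titles below are made up.
def _demo_language_filters():
    patterns = {
        'Chinese': r'^.*([Cc]hinese|汉化|漢化).*$',
        'Japan': r'^((?<![Cc]hinese|[Ee]nglish).)*$',
    }
    for title in ['Sample Title [Chinese]', 'Sample Title', 'Sample Title [English]']:
        matched = [lang for lang, pat in patterns.items() if re.search(pat, title, re.S)]
        # Expected: only 'Chinese' matches the first title, only 'Japan' the
        # second, and neither matches the third.
        print(title, '->', matched)

# _demo_language_filters()  # call manually to inspect the filter behaviour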