爬取美图 | ntssl.cn

爬取美图

这里对 mm131 这个网站进行了美图爬取。

程序源码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*-coding:utf-8 -*-
import requests
import re, os, threading,time


# Root directory where downloaded galleries are stored (one sub-directory per gallery title).
path='/data/image'

class myThread(threading.Thread):
    """Worker thread that fetches a single image.

    Each instance wraps one call to the module-level ``download_pic``;
    the image index doubles as the thread's identifier.
    """

    def __init__(self, url, dir, filename):
        super().__init__()
        # Keep the image number around as a simple thread id.
        self.threadID = filename
        self.url = url
        self.dir = dir
        self.filename = filename

    def run(self):
        # All real work happens in the shared downloader function.
        download_pic(self.url, self.dir, self.filename)


def download_pic(url, dir, filename):
    """Download one image to ``<path>/<dir>/<filename>.jpg``.

    Relies on the module-level ``path`` and ``headers`` globals.
    Non-200 responses are silently skipped (best effort, by design).

    :param url: full URL of the image to fetch
    :param dir: gallery sub-directory name (the gallery title)
    :param filename: image number, used as the file's base name
    """
    # timeout prevents a stalled socket from blocking the worker thread forever
    req = requests.get(url=url, headers=headers, timeout=30)
    if req.status_code == 200:
        new_path = path + '/' + str(dir)
        # exist_ok avoids the exists()/makedirs() race: many threads
        # download into the same gallery directory concurrently.
        os.makedirs(new_path, exist_ok=True)
        with open(new_path + '/' + str(filename) + '.jpg', 'wb') as f:
            f.write(req.content)


def _crawl_list_page(list_url):
    """Fetch one gallery-list page and download every gallery linked on it.

    :param list_url: URL of an mm131 list page
    :return: True when the page answered HTTP 200 (keep paginating),
             False otherwise (caller should stop).
    """
    resp = requests.get(list_url)
    if resp.status_code != 200:
        return False
    gallery_ids = re.findall(
        r'<dd><a target="_blank" href="https://www.mm131.net/xiaohua/([0-9]*).html">',
        resp.text)
    print(gallery_ids)
    for gid in gallery_ids:
        detail = requests.get('https://www.mm131.net/xiaohua/' + str(gid) + '.html')
        # Pages are served as gb2312; decode leniently before scraping.
        html = str(detail.content, 'gb2312', errors='ignore')
        titles = re.findall(r'<h5>(.*)</h5>', html)
        page_counts = re.findall(r'<span class="page-ch">共(.*?)页</span>', html)
        page_count = page_counts[0]
        base_url = 'https://img1.mm131.me/pic/' + str(gid) + '/'
        for title in titles:
            target_dir = path + '/' + str(title)
            if os.path.exists(target_dir):
                # Gallery already downloaded on a previous run.
                print('文件夹已存在,跳过')
                continue
            os.makedirs(target_dir)
            print('开始下载:' + target_dir)
            # One worker thread per image of this gallery.
            workers = []
            for img_no in range(int(page_count)):
                worker = myThread(base_url + str(img_no) + '.jpg', title, img_no)
                worker.start()
                workers.append(worker)
            for worker in workers:
                worker.join()
            print('下载完成')
            # Be polite to the server between galleries.
            time.sleep(5)
    return True


flag = 1
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36Name',
    'Referer': 'https://www.mm131.net/'}
while True:
    # Page 1 lives at the section root; later pages follow the
    # "list_2_<n>.html" naming scheme.
    if flag == 1:
        ok = _crawl_list_page('https://www.mm131.net/xiaohua/')
        print('结束 if')
    else:
        print('进入else')
        ok = _crawl_list_page('https://www.mm131.net/xiaohua/list_2_' + str(flag) + '.html')
        print('结束 else')
    if not ok:
        # A non-200 list page means we ran past the last page.
        break
    flag = flag + 1
    print('---------->', flag, '---->这一页的任务已经完成了')

配置nginx

在nginx中增加这样一条配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Serve the crawler's output directory (/data/image) as a browsable listing.
location /image/ {
root /data/;
# Gallery names come from gb2312 pages, so allow both charsets.
charset utf-8,gbk;
# Directory index with human-readable sizes and local timestamps.
autoindex on;
autoindex_exact_size off;
autoindex_localtime on;
}
# Static page hosting the waterfall/lazy-load gallery viewer.
location /html {
root /usr/local/nginx;
index waterfall_lazyload.html;
# NOTE(review): these proxy_set_header lines have no effect without a
# proxy_pass in this location — confirm whether proxying was intended.
proxy_set_header Host $host;
proxy_set_header X-Real-Ip $remote_addr;
proxy_set_header X-Forwarded-For $remote_addr;

}

配置html页面

这里可以直接查看git上写好的源码 ↓
git上的html页面

效果图如下

----> 效果图 <----

总结

还有很多需要完善的地方。
1、图片没有自动放置在html 目录下的image下,这里需要配置脚本进行自动化。
2、爬虫脚本有待重构。