Environment Setup

Install Python, install PyCharm, and add Python to the environment variables (PATH). To check that the setup worked, open a console and type python.
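
If the console drops into a Python prompt, the interpreter is reachable. A minimal sketch for confirming the version from inside the interpreter:

import sys
import platform

print(sys.version)                # full version string of the running interpreter
print(platform.python_version())  # just the version number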

Built-in Modules

  • os module: operating system, file, and directory operations
  • sys module: the Python interpreter itself
  • time module: functions for handling time
  • datetime module: higher-level date and time handling
  • random module: random number generation
  • math module: mathematical functions
  • re module: regular expressions
  • json module: working with JSON data
  • urllib module: working with URLs and web pages
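
A small sketch touching a few of these modules (the printed values will differ per machine):

import os, sys, time, random, math, json

print(sys.version)                          # interpreter version
print(os.getcwd())                          # current working directory
print(time.strftime('%Y-%m-%d %H:%M:%S'))   # formatted current time
print(random.randint(1, 10))                # random integer from 1 to 10
print(math.sqrt(2))                         # a math function
print(json.dumps({'lang': 'python'}, ensure_ascii=False))  # dict -> JSON string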

The pip Package Manager

pip --version
pip list
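
Beyond checking the version and listing installed packages, the crawler examples later in these notes need a few third-party libraries. They can be installed with pip (these are the usual PyPI names for the modules imported below):

pip install requests
pip install beautifulsoup4
pip install lxml
pip install mysql-connector-python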

Getting Started

import requests

url = "https://movie.douban.com/j/chart/top_list"
param = {
    "type": "24",
    "interval_id": "100:90",
    "action": "",
    "start": 0,
    "limit": 20
}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}
resp = requests.get(url, params=param, headers=headers)
# print(resp.url)
# https://movie.douban.com/j/chart/top_list?type=24&interval_id=100%3A90&action=&start=0&limit=20
print(resp.json())
resp.close()
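
resp.json() parses the response body into Python objects; for this endpoint it should be a list of dicts, one per movie. A minimal sketch of walking that list (the field names "title" and "score" are assumptions about the response format, not verified here):

data = resp.json()  # assumed: a list of movie dicts
for movie in data:
    # "title" and "score" are assumed field names; adjust after inspecting the real response
    print(movie.get("title"), movie.get("score"))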

The re Module

. : matches any character except a newline

\w : a letter, digit, or underscore

\d : a digit

* : repeat zero or more times

? : repeat zero or one time

+ : repeat one or more times

.* : greedy matching

.*? : lazy (non-greedy) matching
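
A quick sketch of greedy versus lazy matching (the HTML snippet is made up for illustration):

import re

html = '<span>first</span><span>second</span>'
print(re.findall(r'<span>.*</span>', html))   # greedy: one match covering the whole string
print(re.findall(r'<span>.*?</span>', html))  # lazy: ['<span>first</span>', '<span>second</span>']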

The important one: finditer

import re

text = "1232,问我的10034,顶顶顶43423"

# findall returns a list; not used that often
nums = re.findall(r"\d+", text)
print(nums)

# finditer returns an iterator; used a lot, and more memory-efficient
results = re.finditer(r"\d+", text)
# print(results)
for i in results:
    print(i.group())

# search is similar, but returns a Match object and stops at the first hit
m = re.search(r"\d+", text)
print(m.group())

# match only matches from the very beginning of the string
m2 = re.match(r"\d+", text)
print(m2.group())

# precompile the pattern with compile
obj = re.compile(r"\d+")
it = obj.finditer(text)
print(it)
for item in it:
    print(item.group())
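
In scraping code, compile is usually paired with named groups so each field can be pulled out by name. A small sketch with a made-up HTML fragment:

import re

obj = re.compile(r'<li><a href="(?P<url>.*?)">(?P<name>.*?)</a></li>', re.S)
html = '<li><a href="/item/1">item one</a></li><li><a href="/item/2">item two</a></li>'

for m in obj.finditer(html):
    print(m.group("name"), m.group("url"))  # extract the named groups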

Scraping 九鼎农场 (Jiuding Farm, xj9d.com)

import requests
import time
import bs4
import csv


def main(page):
    for i in range(0, int(page)):
        getData(i)
        print(f'第{i+1}页爬取完成')


def getData(val):
    url = f"https://www.xj9d.com/cjlist/MarketType/1/ProductType/1.html?page={val}"
    resp = requests.get(url)
    page = bs4.BeautifulSoup(resp.text, 'html.parser')
    table = page.find("table", class_="table")
    trs = table.find_all("tr")[1:]  # skip the header row
    # append mode, so crawling several pages does not overwrite earlier ones
    f = open('data.csv', 'a', newline='', encoding='utf-8')
    writer = csv.writer(f)
    if val == 0:  # write the CSV header only once
        writer.writerow(['产品', '批发价', '货损', '单位', '产地', '市场', '时间'])
    for i in trs:
        tds = i.find_all("td")
        name = tds[0].text
        price = tds[1].text
        huosun = tds[2].text
        unit = tds[3].text
        address = tds[4].text
        depart = tds[5].text
        timer = tds[8].text
        print(name, price, huosun, unit, address, depart, timer)
        writer.writerow([name, price, huosun, unit, address, depart, timer])
    time.sleep(1)
    f.close()
    resp.close()


if __name__ == "__main__":
    page = input("请输入爬取的页数:")
    main(page)
    print('爬虫结束')

Scraping 彼岸图网 (pic.netbian.top wallpapers)

import os.path
import requests
import bs4


def main(page):
    url = f"https://pic.netbian.top/new/index_{page}.html"
    resp = requests.get(url)
    html = bs4.BeautifulSoup(resp.text, 'html.parser')
    ul = html.find('ul', class_='clearfix')
    lis = ul.find_all('li')
    for i in lis:
        a = i.find('a')
        href = a.get('href').strip('/')
        # print(href)
        child_page(href)


def child_page(href):
    urls = "https://pic.netbian.top/" + href
    resp = requests.get(urls)
    html = bs4.BeautifulSoup(resp.text, 'html.parser')
    div = html.find('div', class_='photo-pic')

    src = div.find('img').get('src')
    title = div.find('img').get('title')
    img = requests.get(src).content
    # the with block closes (and flushes) the file automatically
    with open(f'img/{title}.jpg', 'wb') as f:
        f.write(img)
        print(f'{title}')
    # print(src)


if __name__ == '__main__':
    page = input('请输入爬取页数:')
    if not os.path.exists('img'):
        os.mkdir('img')
        print('创建文件夹成功')
    for i in range(0, int(page)):
        main(i + 1)
        print(f"第{i+1}页爬取成功")

Scraping 我要个性网 (woyaogexing.com avatars)

import os.path
import requests
import bs4


def main(page):
    url = f'https://www.woyaogexing.com/touxiang/index_{page}.html'
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    # grab the image links
    html = bs4.BeautifulSoup(resp.text, 'html.parser')
    div = html.find('div', {'class': 'pMain'})
    titles = div.find_all('a', {'class': 'img'})
    for title in titles:
        # print(title)
        t = title.get('title').strip()
        img = title.find('img', {'class': 'lazy'})
        src = img.get('src')
        href = 'https:' + src
        data = requests.get(href).content
        # print(src)
        print(href)
        print(t)
        with open(f'img1/{t}.jpg', 'wb') as f:
            f.write(data)


if __name__ == '__main__':
    page = input('亲,输入页码:')
    if not os.path.exists('img1'):
        os.mkdir('img1')
        print('创建文件夹成功')
    for i in range(1, int(page) + 1):
        main(i + 1)  # note: this requests index_{i+1}.html, so the loop actually starts from the second listing page
        print(f'第{i}页完成')

Scraping Baidu Tieba Comments

import csv
import random
import re
import requests
from time import sleep
from lxml import etree


def main(page):
    url = f'https://tieba.baidu.com/p/7882177660?pn={page}'
    getData(url)


def getData(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    r = requests.get(url, headers=headers).text  # headers must be passed as a keyword argument
    html = etree.HTML(r)

    # floor numbers
    floor = re.findall(r'端</a></span><span class="tail-info">(.*?)</span><span', r)
    # comment text
    # comment = html.xpath('//*[@class="p_content "]/cc/div[2]/text()')  # the page pads the text with a long run of spaces, so this XPath approach does not work
    comment = re.findall(r'j_d_post_content " style="display:;"> (.*?)</div><br>', r)
    # comment times
    times = re.findall(r'楼</span><span class="tail-info">(.*?)</span><div', r)
    print('本页获取完毕')

    for fe, c, t in zip(floor, comment, times):
        if 'img' in c or 'div' in c:
            continue
        print(fe, c, t)
        writer.writerow((fe, c, t))


if __name__ == '__main__':
    print('程序启动')
    with open('tieba.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(("楼层", "评论内容", "评论时间"))
        for i in range(3):
            main(i)
            sleep(1 + random.random())

Scraping the Douban Top 250 List

import csv
import random
import re
import requests
from time import sleep
from lxml import etree


def main(fn, page):
    url = f'https://movie.douban.com/top250?start={page*25}'
    # url1 = 'https://movie.douban.com/subject/1292052/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.35 Safari/537.36',
        'Host': 'movie.douban.com',
    }
    r = requests.get(url, headers=headers).text
    html = etree.HTML(r)

    # movie titles
    movieNames = html.xpath('//*[@class="hd"]/a/span[1]/text()')

    # detail-page links
    infoUrls = html.xpath('//*[@class="hd"]/a/@href')

    print(movieNames)
    print(infoUrls)
    # fetch the detail page of every movie on this list page
    for name, link in zip(movieNames, infoUrls):
        fn.flush()  # flush the CSV file
        try:
            movie_info(name, link)
        except Exception:
            pass  # skip movies whose detail page fails to parse
        sleep(1 + random.random())  # rest about a second between detail pages
    print(f'第{page}页爬取完毕')


def movie_info(name, infoUrl):
    print(name, infoUrl)
    # the detail page needs its own request, so prepare the headers again
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.35 Safari/537.36',
        'Host': 'movie.douban.com',
    }
    r = requests.get(infoUrl, headers=headers).text
    html = etree.HTML(r)
    # 1. director
    director = html.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]
    print(f'导演{director}')
    # 2. genres
    types = re.findall(r'property="v:genre">(.*?)</span>', r)
    types = '/'.join(types)

    # 3. country/region
    country = re.findall(r'制片国家/地区:</span> (.*?)<br', r)[0]

    # 4. release year
    time = html.xpath('//*[@class="year"]/text()')[0]
    time = time[1:5]

    # 5. rating
    score = html.xpath('//*[@class="ll rating_num"]/text()')[0]
    # 6. number of ratings
    comment = html.xpath('//*[@property="v:votes"]/text()')[0]

    print(name, director, types, country, time, score, comment)
    writer.writerow((name, director, types, country, time, score, comment))


if __name__ == '__main__':
    print('电影爬虫启动')
    # save the data as it is collected
    with open('movie.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(("电影名称", "导演", "电影类型", "国家", "上映时间", "评分", "评论人数"))
        for i in range(3):
            main(f, i)
            sleep(3 + random.random())  # rest about 3 seconds after each page

Scraping QQ Images (qq.yh31.com)

import os
import random
import requests
import time
from lxml import etree


def main():
    page = input('请输入爬取页数开始爬取')
    for i in range(0, int(page)):
        getData(13 - i)  # start from list page 13 and work backwards
        time.sleep(random.randint(1, 3))
        print(f'第{i}页爬取完成')
    # getData(13)


def getData(page):
    url = f'https://qq.yh31.com/love/mn/list_{page}.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61',
    }
    r = requests.get(url, headers=headers).content
    html = etree.HTML(r)
    title = html.xpath('//dt/a/img/@alt')
    imgs = html.xpath('//dt/a/img/@data-src')
    # print(title)
    for t, i in zip(title, imgs):
        url = 'https://qq.yh31.com'
        itemUrl = url + i
        # print(itemUrl)
        print(f'图片名字:{t}----------图片地址{itemUrl}')
        r = requests.get(itemUrl, headers=headers).content
        # print(r)
        with open(f'美女/{t}.jpg', 'wb') as f:
            f.write(r)
        time.sleep(1 + random.random())
    length = len(title)
    print(f'第{page}页爬取完成,共有{length}张图片')


if __name__ == '__main__':
    print('爬虫开始干活了')
    if not os.path.exists('美女'):
        os.makedirs('美女')
        print('创建文件夹成功')
    main()

Scraping KuGou Music (酷狗音乐)

import os
import requests
import time
import random
from lxml import etree


def main():
    headers = {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like Mac OS X) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
        'cookie': 'kg_mid=2d94fc2f6b14d23d5a9d31c5bbbb6d18; kg_dfid=3x87US2eiEz443jt7n29R06x; kg_dfid_collect=d41d8cd98f00b204e9800998ecf8427e'
    }

    url1 = 'https://www.kugou.com/yy/rank/home/1-6666.html?from=rank'
    r = requests.get(url1, headers=headers).text
    html = etree.HTML(r)
    infoUrls = html.xpath('//*[@id="rankWrap"]//ul/li/@data-eid')
    for url in infoUrls:
        url = f'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&mid=9902bb7f8dcb720fd6fa063c8d06f45e&encode_album_audio_id={url}'
        r = requests.get(url, headers=headers)
        name = r.json()['data']['album_name']
        playUrl = r.json()['data']['play_backup_url']
        print(f'音乐标题:{name}-----------音乐链接:{playUrl}')
        musicLoad(name, playUrl, headers)  # download this track
        time.sleep(1 + random.random())


def musicLoad(name, playUrl, headers):  # downloads one song to disk
    with open(f'酷狗音乐/{name}.mp3', 'wb') as f:
        f.write(requests.get(playUrl, headers=headers).content)


if __name__ == '__main__':  # entry point
    print("爬虫启动")
    if not os.path.exists('酷狗音乐'):
        os.makedirs('酷狗音乐')
    main()

Scraping 喵御宅 (mfuns.net)

import requests
import os.path
import csv
import mysql.connector
from lxml import etree

db_config = {
    'host': 'localhost',
    'user': 'root',
    'password': '196691',
    'database': 'ry-vue',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}


def get_data(url):
    r = requests.get(url, headers=headers)
    r.encoding = "utf-8"
    html = etree.HTML(r.text)
    # image links
    imgs = html.xpath('//div[@class="m-v__posts"][1]/figure/span/img/@data-src')
    # titles
    titles = html.xpath('//div[@class="m-v__posts"][1]/figure/figcaption/a/@title')
    # authors
    authors = html.xpath('//div[@class="m-v__posts"][1]/figure/figcaption/div[1]/div/text()')
    # play counts
    nums = html.xpath('//div[@class="m-v__posts"][1]/figure/figcaption/div[2]/div[1]/text()')
    # publish times
    times = html.xpath('//div[@class="m-v__posts"][1]/figure/figcaption/div[2]/div[2]/text()')
    # normalize the image links: make them absolute and drop the query string
    for i, item in enumerate(imgs, start=0):
        img = "https://www.mfuns.net/" + item
        # print(img)
        url_parts = img.split('?')
        img_url = url_parts[0]
        imgs[i] = img_url
    print(imgs)
    print(titles)
    print(authors)
    print(nums)
    print(times)
    return imgs, titles, authors, nums, times


def save_data(imgs, titles, authors, nums, times, filename):
    # save to a CSV file
    with open(filename, 'w', encoding="utf-8", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '作者', '播放数量', '发布时间', '图片链接'])
        writer.writerows(zip(titles, authors, nums, times, imgs))
        print("csv文件写入完成")
    # save the images locally
    if not os.path.exists('demo1'):
        os.mkdir('demo1')

    total_img = len(imgs)
    for i, item in enumerate(imgs, start=1):
        with open(f'demo1/{i}.jpg', 'wb') as f:
            data = requests.get(item, headers=headers).content
            f.write(data)
            # print(item)
            print(f'图片{i}')
    print(f'下载图片结束,共有:{total_img}张图片被下载')


# save the records into MySQL
def save_mysql(imgs, titles, authors, nums, times):
    conn = mysql.connector.connect(**db_config)
    cursor = conn.cursor()
    # create the table if it does not exist yet
    create_table_query = """
    CREATE TABLE IF NOT EXISTS posts (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        author VARCHAR(255),
        play_count INT,
        publish_time VARCHAR(255),
        image_url VARCHAR(255)
    )
    """
    cursor.execute(create_table_query)
    # insert the scraped rows
    insert_query = """
    INSERT INTO posts (title, author, play_count, publish_time, image_url)
    VALUES (%s, %s, %s, %s, %s)
    """
    insert_list = list(zip(titles, authors, nums, times, imgs))
    cursor.executemany(insert_query, insert_list)
    # commit the changes and clean up
    conn.commit()
    cursor.close()
    conn.close()
    print("数据已成功存入MySQL数据库。")


if __name__ == '__main__':
    url = 'https://www.mfuns.net/category/30'
    imgs, titles, authors, nums, times = get_data(url)
    save_mysql(imgs, titles, authors, nums, times)
    save_data(imgs, titles, authors, nums, times, 'demo1.csv')

(Screenshots: the downloaded images, the scraped content, and the MySQL database table.)