1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
import csv
import os
import os.path

import mysql.connector
import requests
from lxml import etree
# MySQL connection settings, passed as keyword arguments to
# mysql.connector.connect(). Each value can be overridden through an
# environment variable so credentials do not have to live in the source;
# the fallbacks are the original hard-coded values, kept for backward
# compatibility.
# NOTE(review): the fallback password is still committed here — remove it
# once the deployment sets MFUNS_DB_PASSWORD.
db_config = {
    'host': os.environ.get('MFUNS_DB_HOST', 'localhost'),
    'user': os.environ.get('MFUNS_DB_USER', 'root'),
    'password': os.environ.get('MFUNS_DB_PASSWORD', '196691'),
    'database': os.environ.get('MFUNS_DB_NAME', 'ry-vue'),
}
# Browser-like request headers sent with every HTTP request in this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}

# Seconds to wait on a request before giving up; without a timeout
# requests can block forever on an unresponsive server.
REQUEST_TIMEOUT = 10


def get_data(url):
    """Fetch a listing page and extract the post fields from it.

    Only the first ``div`` with class ``m-v__posts`` is read.

    Args:
        url: Address of the listing page to download.

    Returns:
        A tuple ``(imgs, titles, authors, nums, times)`` of parallel lists
        of strings. ``imgs`` holds image URLs prefixed with the site root
        and stripped of their query string; the other lists hold the raw
        attribute/text values matched by the XPath expressions.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an HTTP error status.
    """
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page into empty lists.
    r.raise_for_status()
    r.encoding = "utf-8"
    html = etree.HTML(r.text)

    figure = '//div[@class="m-v__posts"][1]/figure'
    # Image links (lazy-load attribute).
    imgs = html.xpath(figure + '/span/img/@data-src')
    # Titles.
    titles = html.xpath(figure + '/figcaption/a/@title')
    # Authors.
    authors = html.xpath(figure + '/figcaption/div[1]/div/text()')
    # Counter text. NOTE(review): the original comment called this the
    # like count, but it is saved under a "play count" header/column —
    # confirm which one the page actually shows.
    nums = html.xpath(figure + '/figcaption/div[2]/div[1]/text()')
    # Publish times.
    times = html.xpath(figure + '/figcaption/div[2]/div[2]/text()')

    # Prefix the site root and drop everything from the first '?' onward.
    imgs = ["https://www.mfuns.net/" + src.split('?')[0] for src in imgs]

    print(imgs)
    print(titles)
    print(authors)
    print(nums)
    print(times)
    return imgs, titles, authors, nums, times


def save_data(imgs, titles, authors, nums, times, filename):
    """Write the scraped rows to a CSV file and download the images.

    Args:
        imgs: Image URLs; each is downloaded to ``demo1/<n>.jpg`` where
            ``n`` is its 1-based position in the list.
        titles, authors, nums, times: Columns written next to ``imgs``.
            Rows are built with ``zip``, so they stop at the shortest list.
        filename: Path of the CSV file to (over)write.

    An image whose download fails is reported and skipped, so no empty or
    error-page ``.jpg`` is left behind; the final message reports how many
    images were actually saved.
    """
    # Save as a CSV file.
    with open(filename, 'w', encoding="utf-8", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '作者', '播放数量', '发布时间', '图片链接'])
        writer.writerows(zip(titles, authors, nums, times, imgs))
    print("csv文件写入完成")

    # Save the images locally; exist_ok avoids the exists()/mkdir() race.
    os.makedirs('demo1', exist_ok=True)

    downloaded = 0
    for i, item in enumerate(imgs, start=1):
        # Download first and only then open the file, so a failed request
        # never leaves a zero-byte image on disk.
        try:
            response = requests.get(item, headers=headers, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f'图片{i}下载失败: {exc}')
            continue
        with open(f'demo1/{i}.jpg', 'wb') as f:
            f.write(response.content)
        downloaded += 1
        print(f'图片{i}')
    print(f'下载图片结束,共有:{downloaded}张图片被下载')


def save_mysql(imgs, titles, authors, nums, times):
    """Insert the scraped rows into the ``posts`` table in MySQL.

    The table is created if it does not exist. Rows are built with ``zip``
    over the five lists, so they stop at the shortest list. The cursor and
    the connection are always closed, even when a statement fails.

    NOTE(review): ``nums`` is inserted as-is into the INT ``play_count``
    column — confirm the scraped values are plain digit strings.

    Raises:
        mysql.connector.Error: If connecting or executing a statement fails.
    """
    conn = mysql.connector.connect(**db_config)
    try:
        cursor = conn.cursor()
        try:
            # Create the table.
            create_table_query = """
                CREATE TABLE IF NOT EXISTS posts (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    title VARCHAR(255),
                    author VARCHAR(255),
                    play_count INT,
                    publish_time varchar(255),
                    image_url VARCHAR(255)
                )
            """
            cursor.execute(create_table_query)

            # Insert the data with a parameterized statement.
            insert_query = """
                INSERT INTO posts (title, author, play_count, publish_time, image_url)
                VALUES (%s, %s, %s, %s, %s)
            """
            insert_list = list(zip(titles, authors, nums, times, imgs))
            cursor.executemany(insert_query, insert_list)

            # Commit the changes.
            conn.commit()
        finally:
            cursor.close()
    finally:
        # The original never closed the connection.
        conn.close()
    print("数据已成功存入MySQL数据库。")
if __name__ == '__main__':
    # Script entry point: scrape one category page, then persist the result
    # to MySQL first and to CSV plus local image files second.
    url = 'https://www.mfuns.net/category/30'
    scraped = get_data(url)  # (imgs, titles, authors, nums, times)
    save_mysql(*scraped)
    save_data(*scraped, 'demo1.csv')
|