Dangdang Bestseller Rankings Scraper + NoSQL Data Analysis
Resource Overview
Crawls the Dangdang book bestseller rankings (the 2020-2023 yearly charts, 25 pages each) and connects to a local NoSQL database (MongoDB) for multi-dimensional data analysis. The full scraper script:

from lxml import etree
import requests
import time
import pandas as pd
from pymongo import MongoClient  # used by the optional MongoDB sink below
import urllib3

urllib3.disable_warnings()  # requests are made with verify=False

cookies = {
    'ddscreen': '2',
    'dest_area': 'country_id%3D9000%26province_id%3D111%26city_id%20%3D0%26district_id%3D0%26town_id%3D0',
    '__permanent_id': '20240423210658530124490989268736883',
    'MDD_channelId': '70000',
    'MDD_fromPlatform': '307',
    '__visit_id': '20240530154038979262380281306734049',
    '__out_refer': '',
    '__rpm': '...1717054859559%7C...1717054899777',
    '__trace_id': '20240530154142377181404279783243769',
}

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
}


def SpiderData(url):
    """Fetch one ranking page and return its HTML text."""
    response = requests.get(url, cookies=cookies, headers=headers, verify=False)
    return response.text


def ParseData(page_text, key):
    """Parse one ranking page into a DataFrame; key is the chart name (e.g. '2020年')."""
    tree = etree.HTML(page_text)
    lis = tree.xpath('.//ul[@class="bang_list clearfix bang_list_mode"]/li')
    (rank, name, comments, recommends, author, publish_date, publish_house,
     original_price, discount_price, discount, ebook_price) = [[] for _ in range(11)]
    for li in lis:
        rank.append(''.join(li.xpath('.//div[@class="list_num red" or @class="list_num "]/text()')).replace('.', ''))
        name.append(''.join(li.xpath('.//div[@class="name"]/a/text()')))
        # '12345条评论' -> '12345'
        comments.append(''.join(li.xpath('.//div[@class="star"]/a/text()')).split('条')[0])
        # '99.9%推荐' -> '99.9%'
        recommends.append(''.join(li.xpath('.//div[@class="star"]/span/text()')).split('推荐')[0])
        author.append(''.join(li.xpath('.//div[@class="publisher_info"][1]/a[1]/text()')))
        publish_date.append(''.join(li.xpath('.//div[@class="publisher_info"][2]/span/text()')))
        publish_house.append(''.join(li.xpath('.//div[@class="publisher_info"][2]/a/text()')))
        original_price.append(''.join(li.xpath('.//div[@class="price"]/p[1]/span[1]/text()')).replace('¥', ''))
        discount_price.append(''.join(li.xpath('.//span[@class="price_r"]/text()')).replace('¥', ''))
        discount.append(''.join(li.xpath('.//span[@class="price_s"]/text()')))
        ebook_price.append(''.join(li.xpath('./div[@class="price"]/p[@class="price_e"]/span[@class="price_n"]/text()')).replace('¥', ''))
    dic = {
        '排行榜类型': key,
        '排序': rank,
        '书名': name,
        '评论数': comments,
        '推荐值': recommends,
        '作者': author,
        '出版日期': publish_date,
        '出版社': publish_house,
        '原价': original_price,
        '折扣价': discount_price,
        '折扣比例': discount,
        '电子书价格': ebook_price,
    }
    return pd.DataFrame(dic)


if __name__ == '__main__':
    columns = ['排行榜类型', '排序', '书名', '评论数', '推荐值', '作者', '出版日期',
               '出版社', '原价', '折扣价', '折扣比例', '电子书价格']
    df = pd.DataFrame(columns=columns)
    book_rank_type = {
        '2020年': 'year-2020-0-1',
        '2021年': 'year-2021-0-1',
        '2022年': 'year-2022-0-1',
        '2023年': 'year-2023-0-1',
    }
    for key, value in book_rank_type.items():
        print(f'=====================开始爬{key}榜单数据===================')
        for page in range(25):  # each yearly chart has 25 pages
            print('*****************开始爬取第{}页数据*****************'.format(page + 1))
            url = f'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-{value}-{page + 1}'
            time.sleep(1)  # throttle requests
            data = SpiderData(url)
            df1 = ParseData(data, key)
            k = len(df)
            df = pd.concat([df, df1], axis=0)
            df.drop_duplicates(subset=None, keep='first', inplace=True)
            print('*********第{}页数据爬取完成,爬取{}条数据,目前共爬取{}条数据**********'.format(page + 1, len(df) - k, len(df)))
        df = df.reset_index(drop=True)
        print(f'=================={key}榜单数据爬取完成,共有{len(df)}条数据===================')

    # Optional NoSQL sink: store the records in a local MongoDB instance.
    # client = MongoClient('localhost', 27017)   # default port
    # db = client['dangdang_4213']               # created automatically on first write
    # collection = db['dangdang_4213']
    # collection.insert_many(df.to_dict('records'))
    # client.close()

    df.to_excel('当当网近4年畅销图书榜单数据.xlsx', header=True, index=False)