你的浏览器不支持canvas

墨染半纸,清心煮字

Python爬取去哪儿网信息

Date: Author: 吕雄

本文章采用 知识共享署名-非商业性使用-禁止演绎 4.0 国际许可协议 进行许可。转载请注明来自吕雄

详细代码如下:


import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import urllib.parse
import json
import csv


# Load the CSV written by a previous run so the crawl can resume where it stopped.
def init_df(csv_path="D:/qunaer_sights.csv"):
    """Populate the global ``df_sights`` from a previously written CSV file.

    Args:
        csv_path: Location of the CSV file to read. Defaults to the path the
            rest of this script writes to.

    Side effects:
        Sets the module-level ``df_sights``. It becomes an empty
        ``DataFrame`` when the file is missing, unreadable, empty, or cannot
        be parsed/decoded, so a first run starts from scratch.
    """
    global df_sights
    try:
        df_sights = pd.read_csv(csv_path)
    except (
        OSError,                    # file missing or unreadable (e.g. first run)
        pd.errors.EmptyDataError,   # file exists but contains no data at all
        pd.errors.ParserError,      # file content is not well-formed CSV
        UnicodeDecodeError,         # file is not valid UTF-8 text
    ):
        # Only "no usable previous data" conditions are handled here;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        df_sights = pd.DataFrame()


# Set up the CSV writer; the column header is written only on the first run.
def init_csv():
    """Open the output CSV for appending and create the global CSV writer.

    Side effects:
        Sets the module-level ``f`` (the open file handle) and ``writer``
        (a ``csv.writer`` bound to it). When the global ``df_sights`` has no
        columns, the header row is written.
    """
    global f, writer
    header = ["景点名", "等级", "地址", "介绍", "热度", "价格", "月销量", "经度", "纬度"]

    # "a+" keeps rows from earlier runs; newline="" lets the csv module
    # control line endings itself.
    f = open("D:/qunaer_sights.csv", mode="a+", newline="", encoding="utf-8")
    writer = csv.writer(f)

    no_previous_columns = len(df_sights.columns) == 0
    if no_previous_columns:
        writer.writerow(header)

# Close the CSV file before the program exits.
def close_csv():
    """Close the module-level file handle ``f`` opened by ``init_csv``."""
    # ``f`` is only read here, so no ``global`` declaration is required.
    f.close()


# Look up the longitude/latitude of a sight's address via the Baidu Map API.
def get_geo_info(address, timeout=10):
    """Geocode *address* with the Baidu Map geocoder endpoint.

    Args:
        address: Address text to send as the ``address`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        A ``(longitude, latitude)`` tuple read from
        ``result.location.lng`` / ``result.location.lat`` when the response's
        ``status`` field is ``0``. Returns ``("", "")`` when the request
        fails, the body is not JSON, the status is non-zero, or the location
        fields are missing.
    """
    geo_url = "http://api.map.baidu.com/geocoder/v2/?"
    geo_params = {
        "output": "json",
        # NOTE(review): API key is hard-coded in source; replace it with your
        # own Baidu Map API key.
        "ak": "tAwAffII9G0F8Gs4VRuXnuFmIdbOKCEu",
        "address": address,
    }

    try:
        # Let requests URL-encode the query string.
        geo_resp = requests.get(geo_url, params=geo_params, timeout=timeout)
        json_data = json.loads(geo_resp.text)
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON body: treat like a failed lookup.
        return "", ""

    # Status 0 is the only value handled as success.
    if not isinstance(json_data, dict) or json_data.get("status") != 0:
        return "", ""

    try:
        location = json_data["result"]["location"]
        return location["lng"], location["lat"]
    except (KeyError, TypeError):
        # Successful status but an unexpected payload shape.
        return "", ""


def _first_text(node, selector, default=""):
    """Return the text of the first element under *node* matching *selector*, or *default*."""
    matches = node.select(selector)
    return matches[0].text if matches else default


# Scrape the popular-sight listing pages of Qunar and append them to the CSV.
def dump_qunaer_sights(pages, timeout=10):
    """Crawl listing pages ``1..pages`` and write one CSV row per sight.

    Relies on the module-level ``df_sights`` (rows from a previous run) and
    ``writer`` (the open CSV writer); both must be initialised beforehand.

    Args:
        pages: Number of listing pages to fetch, starting from page 1.
        timeout: Seconds to wait for each listing-page request.

    Behaviour:
        * A page whose request fails is reported and skipped.
        * When a sight name already present in ``df_sights`` is met, the rest
          of that page is skipped (resume support).
        * A sight without a name, price or monthly-sales element is skipped.
        * Missing level, address, intro or popularity elements become "".
    """
    base_url = "http://piao.qunar.com/ticket/list.htm?keyword=热门景点&page="

    # Build the set of already-scraped names once instead of scanning the
    # DataFrame for every sight.
    if "景点名" in df_sights.columns:
        known_names = set(df_sights["景点名"])
    else:
        known_names = set()

    for page in range(1, pages + 1):
        print("page:{0}".format(page))
        url = base_url + str(page)
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print("page:{0} request failed, skipped: {1}".format(page, exc))
            continue
        finally:
            # Pause after every request attempt, successful or not.
            time.sleep(random.uniform(1, 3))

        # Parse the page HTML and collect the per-sight containers.
        soup = BeautifulSoup(resp.text, 'lxml')
        sight_list = soup.select('.sight_item_detail')

        for sight in sight_list:
            # Sight name; an item without one cannot be identified.
            name = _first_text(sight, '.name', None)
            if name is None:
                continue

            # Resume support: a known sight means this page was handled in a
            # previous run, so move on to the next page.
            if name in known_names:
                break

            # Sight level, with the "景区" suffix removed.
            level = _first_text(sight, '.level').replace("景区", "")

            # Sight address, with the label prefix removed.
            address = _first_text(sight, '.address.color999 span').replace("地址:", "")

            # Sight introduction.
            intro = _first_text(sight, '.intro.color999')

            # Sight popularity, with the label prefix removed.
            star = _first_text(sight, '.product_star_level em span').replace("热度 ", "")

            # Ticket price and monthly sales; sights lacking either are skipped.
            price = _first_text(sight, '.sight_item_price em', None)
            sales = _first_text(sight, '.hot_num', None)
            if price is None or sales is None:
                continue

            # Convert the address into longitude/latitude.
            longitude, latitude = get_geo_info(address)

            # Append one row for this sight to the CSV file.
            sight_item = [name, level, address, intro, star, price, sales, longitude, latitude]
            print(sight_item)
            writer.writerow(sight_item)


if __name__ == "__main__":
    # Load earlier results, open the CSV for appending, then crawl.
    init_df()
    init_csv()
    try:
        dump_qunaer_sights(pages=400)
    finally:
        # Close the CSV even when the crawl is interrupted or raises, so
        # rows written so far are flushed and the handle is not leaked.
        close_csv()
    
    

墨染半纸,清心煮字...