Python 抓取豆瓣 Top250 影片信息

挑水做饭 2020年07月27日 15次浏览

产品甩过来一个链接,问我有没有办法提取豆瓣 ID。这怎么能说不行?于是整了个简单的 Python 脚本,将抓到的信息导出为可以用 Excel 打开的 CSV 文件。抓取豆瓣 Top250 影片信息的代码如下:


from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

# Offset of the first movie on the page being processed. It is used to build
# the `start` query parameter and is added to the per-page position to get
# the overall rank.
k = 0
# NOTE(review): `n` is not referenced anywhere in the visible code — confirm
# it is unused before removing it.
n = 1
# Accumulates one row per movie across all fetched pages.
movieData250 = []


def info25(movies=None, offset=None):
    """Build one row per movie found on a single Top 250 result page.

    Args:
        movies: Sequence of parsed ``<li>`` entries, one per movie. When
            omitted, the module-level ``movie`` list is used.
        offset: Number of movies that precede this page; it is added to the
            1-based position on the page to form the rank. When omitted,
            the module-level ``k`` is used.

    Returns:
        A list of ``[rank, title, score, quote, url, douban_id]`` rows, in
        page order.
    """
    if movies is None:
        movies = movie
    if offset is None:
        offset = k

    movieData = []
    # Walk the entries that are actually present instead of assuming exactly
    # 25, so a short page no longer raises IndexError.
    for position, entry in enumerate(movies, start=1):
        # find() returns the first matching span, i.e. the primary title.
        title = entry.find('span', class_="title").string
        score = entry.find('span', class_="rating_num").string
        # The one-line quote is optional; fall back to a placeholder.
        quote_tag = entry.find('span', class_="inq")
        quote = "暂无" if quote_tag is None else quote_tag.string
        address = entry.find('div', attrs={'class': "hd"}).find_all('a')[-1]["href"]
        # The id is what remains of the subject URL once the fixed prefix
        # and the slashes are stripped.
        douban_id = str(address).replace("https://movie.douban.com/subject/", "").replace("/", "")
        movieData.append([offset + position, title, score, quote, address, douban_id])
    return movieData


# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
url = 'https://movie.douban.com/top250'

# Fetch the listing 25 movies at a time. `start=0` addresses the first page,
# so one loop covers every page and no special case is needed for it.
while k < 250:
    h = url + "?start=" + str(k) + "&filter="
    # A timeout keeps a stalled connection from hanging the script forever,
    # and raise_for_status() surfaces HTTP errors instead of parsing an
    # error page as if it were the listing.
    response = requests.get(h, headers=headers, timeout=10)
    response.raise_for_status()
    html_data = response.text
    soup = bs(html_data, 'lxml')
    movieList = soup.find('ol', attrs={'class': "grid_view"})
    if movieList is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise RuntimeError("no <ol class='grid_view'> found in response from " + h)
    # One <li> per movie; info25() reads this list and `k` as globals.
    movie = movieList.find_all('li')
    movieData250 += info25()
    k += 25

print(movieData250)

import codecs

# Column headers, in the same order as the fields of each collected row.
name = [
    '排名',
    '片名',
    '评分',
    '短评',
    '地址',
    '豆瓣id',
]
# Wrap the collected rows in a DataFrame, then write them out as a
# GBK-encoded CSV file.
csv_list = pd.DataFrame(movieData250, columns=name)
csv_list.to_csv(path_or_buf='top250.csv', encoding='gbk')