Python爬虫学习-简单爬取网页数据

# coding: utf-8
# NOTE: PEP 263 requires the cookie to match `coding[:=]\s*name`; the original
# `# coding = utf-8` (space before `=`) is not a valid declaration.

# Fetch the chronic-disease overview page and build a parse tree.
from bs4 import BeautifulSoup
import requests

# Desktop Chrome UA so the site serves the regular HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'}
url = "https://jbk.39.net/mxyy/jbzs/"
# timeout: without it a stalled connection blocks the script forever
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
# 'ignore' drops undecodable bytes rather than raising UnicodeDecodeError
html = r.content.decode('utf-8', 'ignore')
my_page = BeautifulSoup(html, 'lxml')

# Disease name: taken from the <h1> inside <div class="disease">.
# (The loop overwrites on each match, so the last match wins — same as before.)
for tag in my_page.find_all('div', class_='disease'):
    h1 = tag.find('h1')
    if h1 is None:
        continue  # guard: a div without <h1> would crash .get_text()
    disease = h1.get_text()
    disease_name = disease

# Disease introduction: text of each <p class="introduction">; the last
# matching paragraph is what ends up in both variables.
for intro_tag in my_page.find_all('p', class_='introduction'):
    disease_introduction = intro_tag.get_text()
    introduction = disease_introduction

# Basic-info panel: <ul class="disease_basic"> holds alternating label/value
# <span>s (even index = label, odd index = value) plus symptom links.
for tag in my_page.find_all('div', class_='list_left'):
    sub_tag = tag.find('ul', class_="disease_basic")
    if sub_tag is None:
        continue  # guard: panel absent on this page variant; .findAll on None would crash
    my_span = sub_tag.find_all('span')  # find_all: findAll is the legacy bs3 alias
    # my_span is a list; need indexes up to 9 below
    if len(my_span) < 10:
        continue  # layout changed — skip instead of raising IndexError
    is_yibao = my_span[1].text    # covered by medical insurance?
    othername = my_span[3].text   # alternative names
    fbbw = my_span[5].text        # affected body part
    is_infect = my_span[7].text   # contagious?
    dfrq = my_span[9].text        # susceptible population
    my_a = sub_tag.find_all('a')
    # related symptoms: links 2..4; join tolerates fewer anchors than expected
    xgzz = ' '.join(a.text for a in my_a[2:5])
    # ps: .contents[0] or .get_text() is also accepted

# Some tests:
# print(html)
# print(my_page)
# print(sub_tag)
# print(xgzz)
# print(my_span)
# print(my_span[1])

评论