
How to use regular expressions to extract data between opening and closing tags

Scraping scenic-spot data from Qunar.com (去哪儿网)

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup  # web page parsing, data extraction
import re  # regular expressions, text matching
import urllib.request, urllib.error  # open URLs, fetch page data
import xlwt  # Excel output

findTitle = re.compile(r'<span class="cn_tit">(.*?)</span>')
findRating = re.compile(r'<span class="ranking_sum">(.*?)</span>')
findInq = re.compile(r'<div class="desbox">(.*?)</div>')


def main():
    baseurl = "https://travel.qunar.com/p-cs299914-beijing-jingdian-1-"  # base URL of the listing pages
    datalist = getData(baseurl)
    savepath = "Beijing attractions.xls"  # XLS file created in the current directory
    saveData(datalist, savepath)


# Crawl the listing pages
def getData(baseurl):
    datalist = []  # holds the scraped records
    for i in range(0, 20):  # fetch 20 pages of listings
        url = baseurl + str(i + 1)
        html = askURL(url)
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('li', class_="item"):  # each attraction entry
            data = []  # all fields for one attraction
            item = str(item)
            titles = re.findall(findTitle, item)
            data.append(titles)
            rating = re.findall(findRating, item)
            data.append(rating)
            inq = re.findall(findInq, item)
            data.append(inq)
            datalist.append(data)
    return datalist


# Fetch the page content of a given URL
def askURL(url):
    head = {  # simulate browser header information
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# Save the data to a spreadsheet
def saveData(datalist, savepath):
    print("save.......")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook object
    sheet = book.add_sheet('Beijing attractions', cell_overwrite_ok=True)  # create the sheet
    col = ("Attraction name", "Ranking", "Introduction")
    for i in range(0, 3):
        sheet.write(0, i, col[i])  # column headers
    for i in range(0, 200):
        # print("row %d" % (i + 1))  # for testing
        data = datalist[i]
        for j in range(0, 3):
            sheet.write(i + 1, j, data[j])  # data rows
    book.save(savepath)  # save


if __name__ == "__main__":
    main()
    print("Crawling finished!")

The scraped data always contains extra HTML tags.




Reference answer 1:
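A likely cause of the extra tags: the `(.*?)` groups happily capture any markup nested inside the matched element, and `re.findall` returns a list rather than a string, so `data.append(titles)` stores a list of raw captures instead of clean text. A minimal sketch of one way to clean this up (the helper name `cleanText` is illustrative, not part of the original script):

def cleanText(found):
    # re.findall returns a list; take the first hit (or an empty string),
    # then strip any HTML tags left inside the captured text
    text = found[0] if found else ""
    return re.sub(r'<[^>]+>', '', text).strip()

# inside getData's item loop, append plain strings instead of lists:
data.append(cleanText(re.findall(findTitle, item)))
data.append(cleanText(re.findall(findRating, item)))
data.append(cleanText(re.findall(findInq, item)))

With a plain string in each field, the brackets and leftover tags should disappear from the spreadsheet.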

And why doesn't the scrape return 200 items?




Reference answer 2:
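On the count: `getData` appends one row per `<li class="item">` it actually finds, so if any of the 20 pages contains fewer than 10 matching items, or a request fails and `html` comes back empty, the total falls short of 200, and the hard-coded `range(0, 200)` in `saveData` then raises an IndexError. Note also that `findInq` is compiled without `re.S`, so a `desbox` div that spans several lines never matches. A small sketch of the guard, assuming the rest of the script stays as above:

findInq = re.compile(r'<div class="desbox">(.*?)</div>', re.S)  # let '.' match newlines too

def saveData(datalist, savepath):
    print("save....... got %d rows" % len(datalist))  # report how many rows were really scraped
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('Beijing attractions', cell_overwrite_ok=True)
    col = ("Attraction name", "Ranking", "Introduction")
    for i in range(3):
        sheet.write(0, i, col[i])
    for i in range(len(datalist)):  # write what exists instead of a hard-coded 200
        data = datalist[i]
        for j in range(3):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)

Printing `len(datalist)` first tells you whether the shortfall happens during scraping or during saving.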

Copyright notice
This article was created by [CSDN Q & A]; please include the original link when reposting. Thanks.
https://cdmana.com/2021/12/202112121740428053.html
