
BeautifulSoup find_all() web scraping returns empty

Published on 2020-11-27 18:39:39

When trying to scrape multiple pages of this website, I get no content in return. I usually check that all the lists I'm creating are of equal length, but here they all come back with len = 0.

I've used similar code to scrape other websites, so why does this code not work correctly?

Some solutions I've tried that haven't worked for my purposes: the requests.Session() approach suggested in this answer, and the .json approach suggested here.
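Even a stripped-down reproduction shows the symptom (same URL pattern and selector as in the full code below):

import requests
from bs4 import BeautifulSoup

page = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/1")
soup = BeautifulSoup(page.text, 'html.parser')

tables = soup.find_all('table', class_='table table-borderd table-striped table-hover dataTable no-footer clickable right2 right4')
print(len(tables))  # prints 0 -- the symptom described above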

import requests
from bs4 import BeautifulSoup
import pandas as pd

from time import sleep
from random import randint
from googletrans import Translator

translator = Translator()

# One list per output column; they must all end up the same length for the DataFrame
rg = []
ctr_n = []
ctr = []
yr = []
mn = []
sub = []
cst_n = []
cst = []
mag = []
pty_n = []
pty = []
can = []
pev1 = []
vot1 = []
vv1 = []
ivv1 = []
to1 = []
cv1 = []
cvs1 = []
pv1 = []
pvs1 = []
pev2 = []
vot2 = []
vv2 = []
ivv2 = []
to2 = []
cv2 = []
cvs2 = []
pv2 = []
pvs2 = []
seat = []
no_info = []
manual = []

START_PAGE = 1
END_PAGE = 42


for page in range(START_PAGE, END_PAGE + 1):

    response = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))

    response.encoding = response.apparent_encoding

    if not response:
        # Skip pages that come back with an error status
        pass

    else:

        soup = BeautifulSoup(response.text, 'html.parser')

        tbody = soup.find_all('table', class_='table table-borderd table-striped table-hover dataTable no-footer clickable right2 right4')

        sleep(randint(2, 10))
        
        for container in tbody:

            # Each targeted row holds four cells: party, votes, vote share, seats
            col1 = container.find_all('tr', {'data-id':'26079'})
            for info in col1:
                col_1 = info.find_all('td')
                if len(col_1) < 4:
                    continue
                party = col_1[0].get_text(strip=True)
                # translator.translate() returns a Translated object; .text is the string
                party_trans = translator.translate(party).text
                pty_n.append(party_trans)

                pvotes = col_1[1].get_text(strip=True)
                pv1.append(pvotes)

                pshare = col_1[2].get_text(strip=True)
                pvs1.append(pshare)

                mandates = col_1[3].get_text(strip=True)
                seat.append(mandates)

            col2 = container.find_all('tr', {'data-id':'26075'})
            for info in col2:
                col_2 = info.find_all('td')
                if len(col_2) < 4:
                    continue
                party2 = col_2[0].get_text(strip=True)
                party_trans2 = translator.translate(party2).text
                pty_n.append(party_trans2)

                pvotes2 = col_2[1].get_text(strip=True)
                pv1.append(pvotes2)

                pshare2 = col_2[2].get_text(strip=True)
                pvs1.append(pshare2)

                mandates2 = col_2[3].get_text(strip=True)
                seat.append(mandates2)

            col3 = container.find_all('tr', {'data-id':'26063'})
            for info in col3:
                col_3 = info.find_all('td')
                if len(col_3) < 4:
                    continue
                party3 = col_3[0].get_text(strip=True)
                party_trans3 = translator.translate(party3).text
                pty_n.append(party_trans3)

                pvotes3 = col_3[1].get_text(strip=True)
                pv1.append(pvotes3)

                pshare3 = col_3[2].get_text(strip=True)
                pvs1.append(pshare3)

                mandates3 = col_3[3].get_text(strip=True)
                seat.append(mandates3)

            col4 = container.find_all('tr', {'data-id':'26091'})
            for info in col4:
                col_4 = info.find_all('td')
                if len(col_4) < 4:
                    continue
                party4 = col_4[0].get_text(strip=True)
                party_trans4 = translator.translate(party4).text
                pty_n.append(party_trans4)

                pvotes4 = col_4[1].get_text(strip=True)
                pv1.append(pvotes4)

                pshare4 = col_4[2].get_text(strip=True)
                pvs1.append(pshare4)

                mandates4 = col_4[3].get_text(strip=True)
                seat.append(mandates4)

            col5 = container.find_all('tr', {'data-id':'26073'})
            for info in col5:
                col_5 = info.find_all('td')
                if len(col_5) < 4:
                    continue
                party5 = col_5[0].get_text(strip=True)
                party_trans5 = translator.translate(party5).text
                pty_n.append(party_trans5)

                pvotes5 = col_5[1].get_text(strip=True)
                pv1.append(pvotes5)

                pshare5 = col_5[2].get_text(strip=True)
                pvs1.append(pshare5)

                mandates5 = col_5[3].get_text(strip=True)
                seat.append(mandates5)

            col6 = container.find_all('tr', {'data-id':'26080'})
            for info in col6:
                col_6 = info.find_all('td')
                if len(col_6) < 4:
                    continue
                party6 = col_6[0].get_text(strip=True)
                party_trans6 = translator.translate(party6).text
                pty_n.append(party_trans6)

                pvotes6 = col_6[1].get_text(strip=True)
                pv1.append(pvotes6)

                pshare6 = col_6[2].get_text(strip=True)
                pvs1.append(pshare6)

                mandates6 = col_6[3].get_text(strip=True)
                seat.append(mandates6)

        #### TOTAL VOTES ####
        tfoot = soup.find_all('tfoot')
        for data in tfoot:
            fvote = data.find_all('td')
            for info in fvote:
                votefinal = info.get_text(strip=True)
                fvoteindiv = [votefinal]
                # Pad vot1 with the total so it stays the same length as pty_n
                fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
                vot1.extend(fvotelist)
            
        #### CONSTITUENCY NAMES ####
        constit = soup.find_all('a', class_='btn btn-link last')
        for data in constit:
            names = data.get_text()
            # Strip the "Sejm Constituency no." label and the brackets, keep the name
            names_clean = names.replace("Sejm Constituency no.", "").replace("[", "").replace("]", "")
            namesfinal = names_clean.split()[1]
            constitindiv = [namesfinal]
            # Pad cst_n so it stays the same length as pty_n
            constitlist = constitindiv * (len(pty_n) - len(cst_n))
            cst_n.extend(constitlist)

        #### UNSCRAPABLE INFO ####
        region = 'Europe'
        reg2 = [region]
        reglist = reg2 * (len(pty_n) - len(rg))
        rg.extend(reglist)

        country = 'Poland'
        ctr2 = [country]
        ctrlist = ctr2 * (len(pty_n) - len(ctr_n))
        ctr_n.extend(ctrlist)

        year = '2019'
        yr2 = [year]
        yrlist = yr2 * (len(pty_n) - len(yr))
        yr.extend(yrlist)

        month = '10'
        mo2 = [month]
        molist = mo2 * (len(pty_n) - len(mn))
        mn.extend(molist)

        codes = ''
        codes2 = [codes]
        codeslist = codes2 * (len(pty_n) - len(manual))
        manual.extend(codeslist)

        noinfo = '-990'
        noinfo2 = [noinfo]
        noinfolist = noinfo2 * (len(pty_n) - len(no_info))
        no_info.extend(noinfolist)

        print(len(rg), len(pty_n), len(pv1), len(pvs1), len(no_info), len(vot1), len(cst_n))

    

poland19 = pd.DataFrame({
'rg' : rg,
'ctr_n' : ctr_n,
'ctr': manual,
'yr' : yr,
'mn' : mn,
'sub' : manual,
'cst_n': cst_n,
'cst' : manual,
'mag': manual,
'pty_n': pty_n,
'pty': manual,
'can': no_info,  # candidate names are never scraped
'pev1': no_info,
'vot1': vot1,
'vv1': vot1,
'ivv1': no_info,
'to1': no_info,
'cv1': no_info,
'cvs1': no_info,
'pv1': pv1,
'pvs1': pvs1,
'pev2': no_info,
'vot2': no_info,
'vv2': no_info,
'ivv2': no_info,
'to2': no_info,
'cv2': no_info,
'cvs2': no_info,
'pv2' : no_info,
'pvs2' : no_info,
'seat' : manual
})

print(poland19)

poland19.to_csv('poland_19.csv')
Questioner: Sara
Answered by ScrapistA on 2020-11-28 07:51:06

As commented, you probably need to use Selenium, because the tables are rendered by JavaScript and never appear in the HTML that requests receives. You could drop the requests lib and replace the request statements with something like this:

from selenium import webdriver


wd = webdriver.Chrome('pathToChromeDriver')  # or any other browser driver
wd.get(url)  # instead of requests.get(); url is the page URL built in your loop
soup = BeautifulSoup(wd.page_source, 'html.parser')

You need to follow the instructions at this link to install and set up the selenium lib: https://selenium-python.readthedocs.io/
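To make that concrete, here is a minimal sketch of how the fetch step of your loop could look with Selenium, reusing START_PAGE and END_PAGE from your code. The fixed sleep is just an assumption to give the JavaScript time to build the tables; WebDriverWait would be the more robust choice:

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

wd = webdriver.Chrome('pathToChromeDriver')  # adjust the driver path

for page in range(START_PAGE, END_PAGE + 1):
    wd.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))
    sleep(5)  # crude wait for the JavaScript to render the tables
    soup = BeautifulSoup(wd.page_source, 'html.parser')
    # ... the rest of the parsing code stays the same ...

wd.quit()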

Note: I tested your code with selenium and I was able to get the table that you were looking for, but selecting it with the class_=... does not work for some reason. (One possible culprit: the class string in your code contains 'table-borderd', while the Bootstrap class is spelled 'table-bordered', and DataTables may also rewrite the classes at runtime.) Instead, browsing the scraped data I found that the table has an id attribute. So maybe try this instead:

tbody = soup.find_all('table', id="DataTables_Table_0")
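(When a table has no id of its own, DataTables assigns DataTables_Table_0, DataTables_Table_1, and so on per page, so if a page renders several such tables you may need find_all plus an index.)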

And again, do the get requests with the selenium lib.
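Putting the two suggestions together, a minimal end-to-end sketch for a single page could look like this — the id comes from above, and the four cells per row (party, votes, vote share, seats) follow your code, so adjust as needed:

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

wd = webdriver.Chrome('pathToChromeDriver')
wd.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/1")
sleep(5)  # let the JavaScript build the table
soup = BeautifulSoup(wd.page_source, 'html.parser')
wd.quit()

table = soup.find('table', id="DataTables_Table_0")
if table is not None:
    for row in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        if len(cells) >= 4:
            party, votes, share, seats = cells[:4]
            print(party, votes, share, seats)

Hope that was helpful :) Cheers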