OK, so I'm new to web scraping and Python, and I have a sentiment analysis project to do.

I am trying to scrape multiple pages of reviews for multiple dramas from mydramalist.com, i.e. the most popular dramas listed at mydramalist.com/shows/popular (pages 1-5) and, for each of them, the top 20 reviews, which can be found by clicking on the drama and then clicking "Reviews" (pages 1-3).

I have managed to do these separately: one script scrapes the first review for each drama, and another scrapes multiple pages of reviews for a single drama. But I want to combine them so that I get multiple reviews for multiple dramas. Is this possible?

Here's my code for scraping the top review from each drama (listing pages 1-5):

from urllib.parse import urljoin

baseurl = 'https://mydramalist.com/'

# Build the list of "first reviews page" URLs for every drama that appears on
# pages 1-5 of the popular-shows listing.
productlinks = []
for x in range(1, 6):
    r = requests.get(f'https://mydramalist.com/shows/popular?page={x}')
    soup = BeautifulSoup(r.content, 'html.parser')
    productlist = soup.find_all('h6', class_='text-primary title')
    for item in productlist:
        for link in item.find_all('a', href=True):
            # baseurl ends with '/' and the hrefs appear to be root-relative
            # (they start with '/'), so plain concatenation would produce
            # 'https://mydramalist.com//...'. urljoin resolves the href
            # against the base correctly in either case.
            drama_url = urljoin(baseurl, link['href'])
            productlinks.append(drama_url + '/reviews?page=1')

# Fetch the first reviews page of every collected drama and keep its first
# review (title, numeric score, review text).
reviewlist = []
for link in productlinks:
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')

    # find() returns None when nothing matches; calling .text on that would
    # raise AttributeError and abort the whole crawl, so skip such pages.
    name_tag = soup.find('h1', class_='film-title')
    rating_tag = soup.find('span', {'class': "score pull-right"})
    body_tag = soup.find('div', {'class': 'col-sm-12 review-body'})
    if name_tag is None or rating_tag is None or body_tag is None:
        continue

    try:
        rating = float(rating_tag.text)
    except ValueError:
        # Score text is not a number; skip this drama rather than crash.
        continue

    reviewlist.append({
        'name': name_tag.text,
        'rating': rating,
        'body': body_tag.text.strip(),
    })

And here's my code for multiple pages of reviews for just one drama:

# Starting point of the single-drama crawl: page 1 of one drama's reviews.
# The pagination loop below rebinds `url` to each following page.
url = "https://mydramalist.com/18452-goblin/reviews?page=1"

def get_soup(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        A BeautifulSoup object built from the response body with the
        'html.parser' backend.
    """
    # Fetch the page named by the ``url`` argument. Reading the global
    # ``link`` here would either raise NameError or keep fetching whichever
    # page an unrelated loop left in that variable.
    r = requests.get(url)
    return BeautifulSoup(r.content, 'html.parser')

reviewlist = []


def get_reviews(soup):
    """Append every review found on one parsed reviews page to ``reviewlist``.

    Each appended entry is a dict with the keys 'drama' (the page's <title>
    text), 'rating' (the score as a float) and 'body' (the stripped review
    text).

    Args:
        soup: Parsed reviews page (BeautifulSoup tree).

    Returns:
        None. Results are accumulated in the module-level ``reviewlist``.
    """
    title_tag = soup.title
    if title_tag is None:
        # Without a <title> there is no drama name to attach to the reviews.
        return
    drama = title_tag.text

    for entry in soup.find_all('div', class_='review'):
        score_tag = entry.find('span', {'class': "score pull-right"})
        body_tag = entry.find('div', {'class': 'col-sm-12 review-body'})
        # Skip only the malformed review. A single try/except around the whole
        # loop would silently drop every review that follows it on the page.
        if score_tag is None or body_tag is None:
            continue
        try:
            rating = float(score_tag.text)
        except ValueError:
            continue

        reviewlist.append({
            'drama': drama,
            'rating': rating,
            'body': body_tag.text.strip(),
        })

def getnextpage(soup):
    """Return the absolute URL of the next reviews page, or None.

    Looks for the pagination list (<ul class="pagination">) and, inside it,
    the "next" item (<li class="page-item next">) and its link.

    Args:
        soup: Parsed reviews page (BeautifulSoup tree).

    Returns:
        'https://mydramalist.com' followed by the next link's href, or None
        when the page has no pagination list, no "next" item, or the item
        carries no usable link.
    """
    pagination = soup.find('ul', {'class': 'pagination'})
    if pagination is None:
        # No pagination list at all: calling .find on None would raise
        # AttributeError, so treat this as "no further page".
        return None

    next_item = pagination.find('li', {'class': 'page-item next'})
    if next_item is None:
        return None

    anchor = next_item.find('a')
    if anchor is None:
        return None

    href = anchor.get('href')
    if not href:
        return None
    return 'https://mydramalist.com' + str(href)

# Walk the review pages one after another: parse the current page, harvest
# its reviews, then move on to the "next" link until there is none left.
has_more_pages = True
while has_more_pages:
    page_soup = get_soup(url)
    get_reviews(page_soup)
    url = getnextpage(page_soup)
    # Running total of reviews collected so far.
    print(len(reviewlist))
    has_more_pages = bool(url)

I feel like I'm almost there with my code, I just can't configure it to work how I want it :( Any help is much appreciated!

🔴 No definitive solution yet