diff --git a/ZeitOnlineRequests.py b/ZeitOnlineRequests.py
index a1144ef..8978ff1 100644
--- a/ZeitOnlineRequests.py
+++ b/ZeitOnlineRequests.py
@@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-
 import requests
+import csv
+import os
 from bs4 import BeautifulSoup
 
 base_url = 'https://www.zeit.de/serie/die-kaenguru-comics?p='
-
+csv_file = open('output.csv', 'w', newline='')
+csv_file_writer = csv.writer(csv_file, dialect='excel')
 
 for page in range(1, 15):
     page_url = base_url + str(page)
@@ -15,8 +18,26 @@ for page in range(1, 15):
         comic = dict()
         target_page = requests.get(article.a.get('href'))
         target_soup = BeautifulSoup(target_page.text, 'html.parser')
-        comic['title'] = target_soup.find('span', class_="article-heading__title")
+        comic['title'] = target_soup.find('span', class_="article-heading__title").string
         comic['time'] = target_soup.find('time', class_="metadata__date").get('datetime')
-        comic['number'] = target_soup.find('span', class_="article-heading__kicker")
-        comic['image_url'] = target_soup.find(class_="scrollable-media-container").img.get('src')
-        print(comic)
\ No newline at end of file
+        comic['number'] = target_soup.find('span', class_="article-heading__kicker").string.replace("Folge ", '')
+        media_container = target_soup.find(class_="scrollable-media-container")
+        if media_container:
+            comic['image_url'] = media_container.img.get('src')
+        else:
+            media_container = target_soup.find(class_="collapsible-image__figure")
+            if media_container:
+                comic['image_url'] = media_container.img.get('src')
+            else:
+                print('media_container not found')
+
+        csv_file_writer.writerow(comic)
+
+        image_download = requests.get(comic['image_url'])
+        if os.path.exists(comic['number']+'.webp'):
+            comic['number'] += '_'
+        file = open(comic['number']+'.webp', 'xb')
+        file.write(image_download.content)
+        file.close()
+        print(comic)
+csv_file.close()
\ No newline at end of file
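
Two details in the added CSV/download code are worth flagging: csv.writer.writerow() iterates its argument, so passing the comic dict directly writes the dictionary keys rather than the values, and when neither media container is found, comic['image_url'] is never set, so the later requests.get(comic['image_url']) raises a KeyError. Below is a minimal sketch of an alternative, using csv.DictWriter and a guard; the field names, the placeholder comic record, and the example URL are assumptions for illustration, not part of the patch.

    import csv
    import os
    import requests

    fieldnames = ['title', 'time', 'number', 'image_url']   # assumed column order

    with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect='excel')
        writer.writeheader()

        # placeholder record standing in for one iteration of the scraping loop
        comic = {'title': 'Beispiel', 'time': '2023-01-01T06:00:00+01:00',
                 'number': '1', 'image_url': 'https://example.org/comic.webp'}

        if comic.get('image_url'):           # skip entries where no media container was found
            writer.writerow(comic)           # DictWriter writes the values, keyed by fieldnames

            image_download = requests.get(comic['image_url'])
            filename = comic['number'] + '.webp'
            if os.path.exists(filename):     # avoid the FileExistsError that mode 'xb' would raise
                filename = comic['number'] + '_.webp'
            with open(filename, 'xb') as image_file:
                image_file.write(image_download.content)

The with-statements also close output.csv and each image file even if a request fails partway through, which the explicit open()/close() pairs in the patch do not guarantee.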