|
|
@ -1,9 +1,12 @@
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import requests
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import csv
|
|
|
|
|
|
|
|
import os
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
# Prefix of the series listing URL; the page number is appended to it.
base_url = 'https://www.zeit.de/serie/die-kaenguru-comics?p='

# Output CSV for the scraped comic metadata.
# newline='' lets the csv module control line endings itself; the explicit
# UTF-8 encoding keeps non-ASCII titles from depending on the locale default.
csv_file = open('output.csv', 'w', newline='', encoding='utf-8')
csv_file_writer = csv.writer(csv_file, dialect='excel')
|
|
|
|
|
|
|
|
|
|
|
|
for page in range(1, 15):
|
|
|
|
for page in range(1, 15):
|
|
|
|
page_url = base_url + str(page)
|
|
|
|
page_url = base_url + str(page)
|
|
|
@ -15,8 +18,26 @@ for page in range(1, 15):
|
|
|
|
# Scrape a single comic article: collect its metadata, append one CSV row,
# and save the comic image next to the script.
# NOTE: `article` is bound outside the lines visible here (enclosing loop).
comic = dict()
target_page = requests.get(article.a.get('href'))
target_soup = BeautifulSoup(target_page.text, 'html.parser')
comic['title'] = target_soup.find('span', class_="article-heading__title").string
comic['time'] = target_soup.find('time', class_="metadata__date").get('datetime')
# The kicker text carries a "Folge " prefix; keep only what follows it.
comic['number'] = target_soup.find('span', class_="article-heading__kicker").string.replace("Folge ", '')

# Try the known image container classes in order; the first one that
# actually holds an <img> wins. The key is always set so every CSV row has
# the same columns and the download step can test it safely.
comic['image_url'] = None
for container_class in ("scrollable-media-container", "collapsible-image__figure"):
    media_container = target_soup.find(class_=container_class)
    if media_container and media_container.img:
        comic['image_url'] = media_container.img.get('src')
        break
else:
    print('media_container not found')

# writerow() iterates its argument, and iterating a dict yields its keys;
# pass the values so the row holds the scraped data
# (column order: title, time, number, image_url).
csv_file_writer.writerow(comic.values())

if comic['image_url']:
    image_download = requests.get(comic['image_url'])
    # Append underscores until the name is free; a single check is not
    # enough because the suffixed name may exist as well, and mode 'xb'
    # refuses to overwrite.
    while os.path.exists(comic['number'] + '.webp'):
        comic['number'] += '_'
    with open(comic['number'] + '.webp', 'xb') as file:
        file.write(image_download.content)
print(comic)
|
|
|
|
|
|
|
|
# Close the CSV output file so any buffered rows are written to disk.
csv_file.close()
|