From a027fbf7af1f96dc26fe5a610525ae52bcc40c28 Mon Sep 17 00:00:00 2001 From: syeopite Date: Tue, 4 May 2021 01:48:51 -0700 Subject: [PATCH] Rewrite extract_item and extract_items functions This commit completely rewrites the extract_item and extract_items functions. Before this commit these two functions were an unreadable mess. The extract_item function was a lengthy if-elsif chain while the extract_items function contained an incomprehensible mess of .try, else and ||. With this commit both of these functions have been pulled into a separate file with the internal logic being moved to a few classes. This significantly reduces the size of these two methods, enhances readability and makes adding new extraction/parse rules much simpler. See diff for details. --- src/invidious/channels.cr | 11 +- src/invidious/featured_channels.cr | 24 +-- src/invidious/helpers/extractors.cr | 320 ++++++++++++++++++++++++++++ src/invidious/helpers/helpers.cr | 166 +-------------- src/invidious/helpers/macros.cr | 2 +- 5 files changed, 341 insertions(+), 182 deletions(-) create mode 100644 src/invidious/helpers/extractors.cr diff --git a/src/invidious/channels.cr b/src/invidious/channels.cr index 8b03a6f2..0018e5c9 100644 --- a/src/invidious/channels.cr +++ b/src/invidious/channels.cr @@ -380,12 +380,12 @@ def fetch_channel_playlists(ucid, author, continuation, sort_by) return items, continuation end -def fetch_channel_featured_channels(ucid, tab_data, params = nil, continuation = nil, title = nil ) +def fetch_channel_featured_channels(ucid, tab_data, params = nil, continuation = nil, title = nil) if continuation.is_a?(String) - initial_data = request_youtube_api_browse(continuation) - channels_tab_content = initial_data["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"] + initial_data = request_youtube_api_browse(continuation) + channels_tab_content = initial_data["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"] - 
return process_featured_channels([channels_tab_content,], nil, title, continuation_items=true) + return process_featured_channels([channels_tab_content], nil, title, continuation_items = true) else if params.is_a?(String) initial_data = request_youtube_api_browse(ucid, params) @@ -908,7 +908,7 @@ def get_about_info(ucid, locale) country = "" total_views = 0_i64 joined = Time.unix(0) - tabs = {} of String => Tuple(Int32, String) # TabName => {TabiZZndex, browseEndpoint params} + tabs = {} of String => Tuple(Int32, String) # TabName => {TabiZZndex, browseEndpoint params} links = [] of {String, String, String} tabs_json = initdata["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]?.try &.as_a? @@ -964,7 +964,6 @@ def get_about_info(ucid, locale) tab_names << node["tabRenderer"]["title"].as_s.downcase tab_data << {i, node["tabRenderer"]["endpoint"]["browseEndpoint"]["params"].as_s} end - end tabs = Hash.zip(tab_names, tab_data) end diff --git a/src/invidious/featured_channels.cr b/src/invidious/featured_channels.cr index 0b44aeae..e1486403 100644 --- a/src/invidious/featured_channels.cr +++ b/src/invidious/featured_channels.cr @@ -89,16 +89,16 @@ def _extract_channel_data(channel) end FeaturedChannel.new({ - author: author, - ucid: ucid, + author: author, + ucid: ucid, author_thumbnail: author_thumbnail, subscriber_count: subscriber_count, - video_count: video_count, - description_html: description_html + video_count: video_count, + description_html: description_html, }) end -def process_featured_channels(data, submenu_data, title=nil, continuation_items=false) +def process_featured_channels(data, submenu_data, title = nil, continuation_items = false) all_categories = [] of Category if submenu_data.is_a?(Bool) @@ -119,17 +119,17 @@ def process_featured_channels(data, submenu_data, title=nil, continuation_items= raw_category["content"]["horizontalListRenderer"]["items"].as_a.each do |channel| contents << _extract_channel_data(channel["gridChannelRenderer"]) end - # 
Single channel + # Single channel else channel = raw_category["content"]["expandedShelfContentsRenderer"]["items"][0]["channelRenderer"] contents = _extract_channel_data(channel) end all_categories << Category.new({ - title: category_title, - contents: contents, + title: category_title, + contents: contents, browse_endpoint_param: browse_endpoint_param, - continuation_token: nil + continuation_token: nil, }) end else @@ -159,10 +159,10 @@ def process_featured_channels(data, submenu_data, title=nil, continuation_items= end all_categories << Category.new({ - title: category_title, - contents: contents, + title: category_title, + contents: contents, browse_endpoint_param: browse_endpoint_param, - continuation_token: continuation_token + continuation_token: continuation_token, }) end diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr new file mode 100644 index 00000000..6e16c879 --- /dev/null +++ b/src/invidious/helpers/extractors.cr @@ -0,0 +1,320 @@ +# This file contains helper methods to parse the Youtube API json data into +# neat little packages we can use + +# Tuple of Parsers/Extractors so we can easily cycle through them. +private ITEM_CONTAINER_EXTRACTOR = { + YoutubeTabsExtractor.new, + SearchResultsExtractor.new, + ContinuationExtractor.new +} + +private ITEM_PARSERS = { + VideoParser.new, + ChannelParser.new, + GridPlaylistParser.new, + PlaylistParser.new, +} + +private struct AuthorFallback + property name, id + + def initialize(@name : String? = nil, @id : String? = nil) + end +end + +# The following are the parsers for parsing raw item data into neatly packaged structs. +# They're accessed through the process() method which validates the given data as applicable +# to their specific struct and then use the internal parse() method to assemble the struct +# specific to their category. +private class ItemParser + # Base type for all item parsers. 
+ def process(item : JSON::Any, author_fallback : AuthorFallback) + end + + private def parse(item_contents : JSON::Any, author_fallback : AuthorFallback) + end +end + +private class VideoParser < ItemParser + def process(item, author_fallback) + if item_contents = (item["videoRenderer"]? || item["gridVideoRenderer"]?) + return self.parse(item_contents, author_fallback) + end + end + + private def parse(item_contents, author_fallback) + video_id = item_contents["videoId"].as_s + title = item_contents["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || "" + + author_info = item_contents["ownerText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? + author = author_info.try &.["text"].as_s || author_fallback.name || "" + author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" + + published = item_contents["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local + view_count = item_contents["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64 + description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" + length_seconds = item_contents["lengthText"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || + item_contents["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?).try &.["thumbnailOverlayTimeStatusRenderer"]? 
+ .try &.["text"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || 0 + + live_now = false + paid = false + premium = false + + premiere_timestamp = item_contents["upcomingEventData"]?.try &.["startTime"]?.try { |t| Time.unix(t.as_s.to_i64) } + + item_contents["badges"]?.try &.as_a.each do |badge| + b = badge["metadataBadgeRenderer"] + case b["label"].as_s + when "LIVE NOW" + live_now = true + when "New", "4K", "CC" + # TODO + when "Premium" + paid = true + + # TODO: Potentially available as item_contents["topStandaloneBadge"]["metadataBadgeRenderer"] + premium = true + else nil # Ignore + end + end + + SearchVideo.new({ + title: title, + id: video_id, + author: author, + ucid: author_id, + published: published, + views: view_count, + description_html: description_html, + length_seconds: length_seconds, + live_now: live_now, + paid: paid, + premium: premium, + premiere_timestamp: premiere_timestamp, + }) + end +end + +private class ChannelParser < ItemParser + def process(item, author_fallback) + if item_contents = item["channelRenderer"]? + return self.parse(item_contents, author_fallback) + end + end + + private def parse(item_contents, author_fallback) + author = item_contents["title"]["simpleText"]?.try &.as_s || author_fallback.name || "" + author_id = item_contents["channelId"]?.try &.as_s || author_fallback.id || "" + + author_thumbnail = item_contents["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try &.["url"]?.try &.as_s || "" + subscriber_count = item_contents["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0 + + auto_generated = false + auto_generated = true if !item_contents["videoCountText"]? 
+ video_count = item_contents["videoCountText"]?.try &.["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 + description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" + + SearchChannel.new({ + author: author, + ucid: author_id, + author_thumbnail: author_thumbnail, + subscriber_count: subscriber_count, + video_count: video_count, + description_html: description_html, + auto_generated: auto_generated, + }) + end +end + +private class GridPlaylistParser < ItemParser + def process(item, author_fallback) + if item_contents = item["gridPlaylistRenderer"]? + return self.parse(item_contents, author_fallback) + end + end + + private def parse(item_contents, author_fallback) + title = item_contents["title"]["runs"].as_a[0]?.try &.["text"].as_s || "" + plid = item_contents["playlistId"]?.try &.as_s || "" + + video_count = item_contents["videoCountText"]["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 + playlist_thumbnail = item_contents["thumbnail"]["thumbnails"][0]?.try &.["url"]?.try &.as_s || "" + + SearchPlaylist.new({ + title: title, + id: plid, + author: author_fallback.name || "", + ucid: author_fallback.id || "", + video_count: video_count, + videos: [] of SearchPlaylistVideo, + thumbnail: playlist_thumbnail, + }) + end +end + +private class PlaylistParser < ItemParser + def process(item, author_fallback) + if item_contents = item["playlistRenderer"]? + return self.parse(item_contents, author_fallback) + end + end + + def parse(item_contents, author_fallback) + title = item_contents["title"]["simpleText"]?.try &.as_s || "" + plid = item_contents["playlistId"]?.try &.as_s || "" + + video_count = item_contents["videoCount"]?.try &.as_s.to_i || 0 + playlist_thumbnail = item_contents["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || "" + + author_info = item_contents["shortBylineText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? 
+ author = author_info.try &.["text"].as_s || author_fallback.name || "" + author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" + + videos = item_contents["videos"]?.try &.as_a.map do |v| + v = v["childVideoRenderer"] + v_title = v["title"]["simpleText"]?.try &.as_s || "" + v_id = v["videoId"]?.try &.as_s || "" + v_length_seconds = v["lengthText"]?.try &.["simpleText"]?.try { |t| decode_length_seconds(t.as_s) } || 0 + SearchPlaylistVideo.new({ + title: v_title, + id: v_id, + length_seconds: v_length_seconds, + }) + end || [] of SearchPlaylistVideo + + # TODO: item_contents["publishedTimeText"]? + + SearchPlaylist.new({ + title: title, + id: plid, + author: author, + ucid: author_id, + video_count: video_count, + videos: videos, + thumbnail: playlist_thumbnail, + }) + end +end + +# The following are the extractors for extracting an array of items from +# the internal Youtube API's JSON response. The result is then packaged into +# a structure we can more easily use via the parsers above. Their internals are +# identical to the item parsers. + +private class ItemsContainerExtractor + def process(item : Hash(String, JSON::Any)) + end + + private def extract(target : JSON::Any) + end +end + +private class YoutubeTabsExtractor < ItemsContainerExtractor + def process(initial_data) + if target = initial_data["twoColumnBrowseResultsRenderer"]? + self.extract(target) + end + end + + private def extract(target) + raw_items = [] of JSON::Any + selected_tab = extract_selected_tab(target["tabs"]) + content = selected_tab["tabRenderer"]["content"] + + content["sectionListRenderer"]["contents"].as_a.each do | renderer_container | + renderer_container = renderer_container["itemSectionRenderer"] + renderer_container_contents = renderer_container["contents"].as_a[0] + + # Shelf renderer usually refer to a category and would need special handling once + # An extractor for categories are added. 
But for now it is just used to + # extract items for the trending page + if items_container = renderer_container_contents["shelfRenderer"]? + if items_container["content"]["expandedShelfContentsRenderer"]? + items_container = items_container["content"]["expandedShelfContentsRenderer"] + end + elsif items_container = renderer_container_contents["gridRenderer"]? + else + items_container = renderer_container_contents + end + + items_container["items"].as_a.each do | item | + raw_items << item + end + end + + return raw_items + end +end + +private class SearchResultsExtractor < ItemsContainerExtractor + def process(initial_data) + if target = initial_data["twoColumnSearchResultsRenderer"]? + self.extract(target) + end + end + + private def extract(target) + raw_items = [] of JSON::Any + content = target["primaryContents"] + renderer = content["sectionListRenderer"]["contents"].as_a[0]["itemSectionRenderer"] + raw_items = renderer["contents"].as_a + + return raw_items + end +end + +private class ContinuationExtractor < ItemsContainerExtractor + def process(initial_data) + if target = initial_data["continuationContents"]? + self.extract(target) + end + end + + private def extract(target) + raw_items = [] of JSON::Any + if content = target["gridContinuation"]? + raw_items = content["items"].as_a + end + + return raw_items + end +end + +def extract_item(item : JSON::Any, author_fallback : String? = nil, author_id_fallback : String? = nil) + # Parses an item from Youtube's JSON response into a more usable structure. + # The end result can either be a SearchVideo, SearchPlaylist or SearchChannel. + author_fallback = AuthorFallback.new(author_fallback, author_id_fallback) + + # Cycles through all of the item parsers and attempts to parse the raw YT JSON data. + # Each parser automatically validates the data given to see if the data is + # applicable to itself. If not, nil is returned and the next parser is attempted. 
+ ITEM_PARSERS.each do |parser| + result = parser.process(item, author_fallback) + if !result.nil? + return result + end + end + # TODO radioRenderer, showRenderer, shelfRenderer, horizontalCardListRenderer, searchPyvRenderer +end + +def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil) + items = [] of SearchItem + initial_data = initial_data["contents"]?.try &.as_h || initial_data["response"]?.try &.as_h || initial_data + + # This is identical to the parser cycling of extract_item(). + ITEM_CONTAINER_EXTRACTOR.each do | extractor | + results = extractor.process(initial_data) + if !results.nil? + results.each do | item | + parsed_result = extract_item(item, author_fallback, author_id_fallback) + + if !parsed_result.nil? + items << parsed_result + end + end + end + end + + return items +end \ No newline at end of file diff --git a/src/invidious/helpers/helpers.cr b/src/invidious/helpers/helpers.cr index 7353f2d9..7c234f3c 100644 --- a/src/invidious/helpers/helpers.cr +++ b/src/invidious/helpers/helpers.cr @@ -251,169 +251,9 @@ def extract_videos(initial_data : Hash(String, JSON::Any), author_fallback : Str extract_items(initial_data, author_fallback, author_id_fallback).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo)) end -def extract_item(item : JSON::Any, author_fallback : String? = nil, author_id_fallback : String? = nil) - if i = (item["videoRenderer"]? || item["gridVideoRenderer"]?) - video_id = i["videoId"].as_s - title = i["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || "" - - author_info = i["ownerText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? 
- author = author_info.try &.["text"].as_s || author_fallback || "" - author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || "" - - published = i["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local - view_count = i["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64 - description_html = i["descriptionSnippet"]?.try { |t| parse_content(t) } || "" - length_seconds = i["lengthText"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || - i["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?).try &.["thumbnailOverlayTimeStatusRenderer"]? - .try &.["text"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || 0 - - live_now = false - paid = false - premium = false - - premiere_timestamp = i["upcomingEventData"]?.try &.["startTime"]?.try { |t| Time.unix(t.as_s.to_i64) } - - i["badges"]?.try &.as_a.each do |badge| - b = badge["metadataBadgeRenderer"] - case b["label"].as_s - when "LIVE NOW" - live_now = true - when "New", "4K", "CC" - # TODO - when "Premium" - paid = true - - # TODO: Potentially available as i["topStandaloneBadge"]["metadataBadgeRenderer"] - premium = true - else nil # Ignore - end - end - - SearchVideo.new({ - title: title, - id: video_id, - author: author, - ucid: author_id, - published: published, - views: view_count, - description_html: description_html, - length_seconds: length_seconds, - live_now: live_now, - paid: paid, - premium: premium, - premiere_timestamp: premiere_timestamp, - }) - elsif i = item["channelRenderer"]? 
- author = i["title"]["simpleText"]?.try &.as_s || author_fallback || "" - author_id = i["channelId"]?.try &.as_s || author_id_fallback || "" - - author_thumbnail = i["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try &.["url"]?.try &.as_s || "" - subscriber_count = i["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0 - - auto_generated = false - auto_generated = true if !i["videoCountText"]? - video_count = i["videoCountText"]?.try &.["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 - description_html = i["descriptionSnippet"]?.try { |t| parse_content(t) } || "" - - SearchChannel.new({ - author: author, - ucid: author_id, - author_thumbnail: author_thumbnail, - subscriber_count: subscriber_count, - video_count: video_count, - description_html: description_html, - auto_generated: auto_generated, - }) - elsif i = item["gridPlaylistRenderer"]? - title = i["title"]["runs"].as_a[0]?.try &.["text"].as_s || "" - plid = i["playlistId"]?.try &.as_s || "" - - video_count = i["videoCountText"]["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 - playlist_thumbnail = i["thumbnail"]["thumbnails"][0]?.try &.["url"]?.try &.as_s || "" - - SearchPlaylist.new({ - title: title, - id: plid, - author: author_fallback || "", - ucid: author_id_fallback || "", - video_count: video_count, - videos: [] of SearchPlaylistVideo, - thumbnail: playlist_thumbnail, - }) - elsif i = item["playlistRenderer"]? - title = i["title"]["simpleText"]?.try &.as_s || "" - plid = i["playlistId"]?.try &.as_s || "" - - video_count = i["videoCount"]?.try &.as_s.to_i || 0 - playlist_thumbnail = i["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || "" - - author_info = i["shortBylineText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? 
- author = author_info.try &.["text"].as_s || author_fallback || "" - author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || "" - - videos = i["videos"]?.try &.as_a.map do |v| - v = v["childVideoRenderer"] - v_title = v["title"]["simpleText"]?.try &.as_s || "" - v_id = v["videoId"]?.try &.as_s || "" - v_length_seconds = v["lengthText"]?.try &.["simpleText"]?.try { |t| decode_length_seconds(t.as_s) } || 0 - SearchPlaylistVideo.new({ - title: v_title, - id: v_id, - length_seconds: v_length_seconds, - }) - end || [] of SearchPlaylistVideo - - # TODO: i["publishedTimeText"]? - - SearchPlaylist.new({ - title: title, - id: plid, - author: author, - ucid: author_id, - video_count: video_count, - videos: videos, - thumbnail: playlist_thumbnail, - }) - elsif i = item["radioRenderer"]? # Mix - # TODO - elsif i = item["showRenderer"]? # Show - # TODO - elsif i = item["shelfRenderer"]? - elsif i = item["horizontalCardListRenderer"]? - elsif i = item["searchPyvRenderer"]? # Ad - end -end - -def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil) - items = [] of SearchItem - - channel_v2_response = initial_data - .try &.["continuationContents"]? - .try &.["gridContinuation"]? - .try &.["items"]? - - if channel_v2_response - channel_v2_response.try &.as_a.each { |item| - extract_item(item, author_fallback, author_id_fallback) - .try { |t| items << t } - } - else - initial_data.try { |t| t["contents"]? || t["response"]? } - .try { |t| t["twoColumnBrowseResultsRenderer"]?.try &.["tabs"].as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0]?.try &.["tabRenderer"]["content"] || - t["twoColumnSearchResultsRenderer"]?.try &.["primaryContents"] || - t["continuationContents"]? } - .try { |t| t["sectionListRenderer"]? || t["sectionListContinuation"]? 
} - .try &.["contents"].as_a - .each { |c| c.try &.["itemSectionRenderer"]?.try &.["contents"].as_a - .try { |t| t[0]?.try &.["shelfRenderer"]?.try &.["content"]["expandedShelfContentsRenderer"]?.try &.["items"].as_a || - t[0]?.try &.["gridRenderer"]?.try &.["items"].as_a || t } - .each { |item| - extract_item(item, author_fallback, author_id_fallback) - .try { |t| items << t } - } } - end - - items +def extract_selected_tab(tabs) + # Extract the selected tab from the array of tabs Youtube returns + return selected_target = tabs.as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0] end def check_enum(db, enum_name, struct_type = nil) diff --git a/src/invidious/helpers/macros.cr b/src/invidious/helpers/macros.cr index 04f89a22..f98a3e9f 100644 --- a/src/invidious/helpers/macros.cr +++ b/src/invidious/helpers/macros.cr @@ -48,7 +48,7 @@ module JSON::Serializable end end -macro templated(filename, template = "template", navbar_search = true, buffer_footer=false) +macro templated(filename, template = "template", navbar_search = true, buffer_footer = false) navbar_search = {{navbar_search}} buffer_footer = {{buffer_footer}}