From d573461a671a0ccbf1b06c5e16cbc7c5cff52139 Mon Sep 17 00:00:00 2001
From: Omar Roth <omarroth@hotmail.com>
Date: Sat, 3 Mar 2018 15:59:21 -0600
Subject: [PATCH] Parse HTML properly instead of relying on regexes

---
 src/helpers.cr | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/helpers.cr b/src/helpers.cr
index 45204132..094d7dc1 100644
--- a/src/helpers.cr
+++ b/src/helpers.cr
@@ -132,8 +132,19 @@ def fetch_video(id, client)
   dislikes = dislikes ? dislikes.content.delete(",").to_i : 0
 
   description = html.xpath_node(%q(//p[@id="eow-description"]))
+  if description
+    description.xpath_nodes(%q(//a/@href)).each do |match|
+      uri = URI.parse(match.content)
+
+      if uri.host =~ /(www\.)?youtube.com/
+        uri = uri.full_path
+        puts uri
+      end
+
+      match.content = uri.to_s
+    end
+  end
   description = description ? description.to_xml : ""
-  description = description.gsub(/(https:\/\/)|(http:\/\/)?(www\.)?(youtube\.com)/, "")
 
   wilson_score = ci_lower_bound(likes, likes + dislikes)
 
@@ -278,6 +289,20 @@ def template_comments(root)
       author = child["data"]["author"]
       score = child["data"]["score"]
       body_html = HTML.unescape(child["data"]["body_html"].as_s)
+      body_html = XML.parse_html(body_html)
+
+      body_html.xpath_nodes(%q(//a/@href)).each do |match|
+        uri = URI.parse(match.content)
+
+        if uri.host =~ /(www\.)?youtube.com/
+          uri = uri.full_path
+          puts uri
+        end
+
+        match.content = uri.to_s
+      end
+
+      body_html = body_html.to_s
 
       replies_html = ""
       if child["data"]["replies"] != ""
@@ -317,8 +342,6 @@ def template_comments(root)
     end
   end
 
-  html = html.gsub(/(https:\/\/)|(http:\/\/)?(www\.)?(youtube\.com)/, "")
-
   return html
 end