first commit
This commit is contained in:
commit
20f6dc6d0e
5 changed files with 2782 additions and 0 deletions
78
ClearURLsCore.py
Executable file
78
ClearURLsCore.py
Executable file
|
|
@ -0,0 +1,78 @@
|
|||
'''
|
||||
* ClearURLs
|
||||
* Copyright (c) 2019 Kevin Röbert
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
'''
|
||||
from urllib.parse import unquote
|
||||
from urllib.request import urlopen
|
||||
import json
|
||||
import re
|
||||
|
||||
'''
|
||||
* Python class that uses the ClearURLs data.min.json
|
||||
* to clean urls. Also redirections included.
|
||||
*
|
||||
* @param dataURL The URL to the data.min.json,
|
||||
* default is the gitlab repo of ClearURLs
|
||||
'''
|
||||
class ClearURLsCore:
    '''
    Strips known tracking parameters from URLs using the ClearURLs rule set
    (data.min.json). Redirection rules are also honoured: when a provider
    marks a URL as a redirector, the embedded target URL is returned instead.
    '''

    def __init__(self, dataURl
        = "https://gitlab.com/KevinRoebert/ClearUrls/raw/master/data/data.min.json"):
        # NOTE: parameter name "dataURl" (typo) kept as-is for backward
        # compatibility with existing keyword callers.
        self.dataURL = dataURl
        # urlPattern -> {"rules": [...], "exceptions": [...], "redirections": [...]}
        self.pages = {}
        self.initRules()

    def initRules(self):
        '''Load the ClearURLs rule set and expand it into self.pages.'''
        # download ClearURLs rule set
        #data = urlopen(self.dataURL).read()

        # load rule set from local file (context manager closes the handle)
        with open("data.min.json") as f:
            data = f.read()
        json_data = json.loads(data)

        # extract and expand rules
        for provider, spec in json_data["providers"].items():
            urlPattern = spec["urlPattern"]
            self.pages[urlPattern] = {
                # wrap each raw parameter name so the regex matches the whole
                # "<separator><param>=<value>" chunk and it can be deleted
                "rules": ["([\\/|\\?]|(&|&))(" + rule + "=[^\\/|\\?|&]*)"
                          for rule in spec["rules"]],
                "exceptions": spec["exceptions"],
                "redirections": spec["redirections"],
            }

    def clean(self, url):
        '''Return *url* with tracking parameters removed.

        A URL matching a provider's exception is returned untouched; a URL
        matching a redirection rule yields the unquoted redirect target.
        '''
        # Split on the first "?" only. The previous regex-based split left
        # fields == "?" + url for URLs without a query string, which could
        # duplicate path segments containing "=" into a bogus query.
        domain, sep, query = url.partition("?")
        fields = "?" + query

        for page in self.pages:
            if re.search(page, url):
                for exception in self.pages[page]["exceptions"]:
                    if re.search(exception, url):
                        return url
                for redirection in self.pages[page]["redirections"]:
                    # search once and reuse the match object
                    match = re.search(redirection, url)
                    if match:
                        return unquote(match.group(1))
                for rule in self.pages[page]["rules"]:
                    fields = re.sub(rule, "", fields)

        # keep only the surviving key=value pairs
        finalFields = re.findall("[^\\/|\\?|&]+=[^\\/|\\?|&]+", fields)
        if finalFields:
            return domain + "?" + "&".join(finalFields)
        return domain
|
||||
BIN
Mastodon_Trash.png
Executable file
BIN
Mastodon_Trash.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 287 KiB |
9
README.md
Normal file
9
README.md
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
# masto-clear-url
|
||||
|
||||
A mastodon bot to raise awareness of tracking elements in URLs.
|
||||
|
||||
It uses the [ClearURLs](https://gitlab.com/KevinRoebert/ClearUrls) script and rules.
|
||||
|
||||
You should be careful to not spam people without consent.
|
||||
|
||||

|
||||
60
bot.py
Executable file
60
bot.py
Executable file
|
|
@ -0,0 +1,60 @@
|
|||
"""
|
||||
Mastodon bot to clean URLs
|
||||
"""
|
||||
from mastodon import Mastodon, StreamListener
|
||||
from urllib.parse import urlparse
|
||||
import validators
|
||||
from bs4 import BeautifulSoup
|
||||
import ClearURLsCore
|
||||
import signal
|
||||
|
||||
# Module-level Mastodon client; shared by Listener.on_update for replies
# and by the stream setup at the bottom of the file.
mastodon = Mastodon(
    #file with your account token
    access_token='token.secret',
    #instance url — NOTE(review): left empty, must be filled in before running
    api_base_url=''
)
|
||||
|
||||
class Listener(StreamListener):
    """Stream listener that replies to toots containing tracker-laden URLs."""

    def on_update(self, toot):
        """Handle a newly streamed status: clean its links and reply if any
        URL actually lost tracking parameters."""
        markup = BeautifulSoup(toot.content, 'html.parser')

        cleaned_urls = []
        for anchor in markup.find_all('a'):
            href = anchor.get('href')
            if not validators.url(href):
                continue
            stripped = cleaner.clean(href)
            if stripped == href:
                continue
            # if the cleaned and original URLs are almost the same size it's
            # probably too spammy to toot about it
            if abs(len(stripped) - len(href)) < 6:
                continue
            cleaned_urls.append(stripped)

        # nothing cleaned -> nothing to say
        if not cleaned_urls:
            return

        #reply
        idempotency_key = str(toot.id)
        plural = "s" if len(cleaned_urls) > 1 else ""

        status = "The URL" + plural + " you posted contains trackers!\nI cleaned it for you:\n\r"
        for stripped in cleaned_urls:
            status = status + stripped + "\n"

        mastodon.status_reply(toot, status, in_reply_to_id=toot.id, media_ids=None, sensitive=False, visibility="public", spoiler_text=None, language="en", idempotency_key=idempotency_key, content_type=None, scheduled_at=None, poll=None, untag=True)
|
||||
|
||||
# NOTE(review): result is never checked — presumably a startup connectivity
# probe; confirm whether a failed check should abort instead.
is_healthy = mastodon.stream_healthy()

listener = Listener()
# Rule engine read by Listener.on_update via module-level global.
cleaner = ClearURLsCore.ClearURLsCore()

# Stream the local timeline asynchronously; replies are posted from
# Listener.on_update on the stream's worker thread.
handle = mastodon.stream_local(listener, run_async=True, timeout=300, \
    reconnect_async=False, reconnect_async_wait_sec=5)

# Block the main thread until a signal arrives, keeping the async stream alive.
signal.pause()
|
||||
2635
data.min.json
Normal file
2635
data.min.json
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue