first commit

This commit is contained in:
Swrup 2022-03-26 17:03:26 +01:00
commit 20f6dc6d0e
5 changed files with 2782 additions and 0 deletions

78
ClearURLsCore.py Executable file
View file

@ -0,0 +1,78 @@
'''
* ClearURLs
* Copyright (c) 2019 Kevin Röbert
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
from urllib.parse import unquote
from urllib.request import urlopen
import json
import re
'''
* Python class that uses the ClearURLs data.min.json
* to clean urls. Redirections are also handled.
*
* @param dataURL The url to the data.min.json,
* default is the gitlab repo of ClearURLs
'''
class ClearURLsCore:
    """Strip known tracking parameters from URLs using the ClearURLs rule set.

    Rules are loaded from a local ``data.min.json`` (same format as the
    ClearURLs browser extension). Redirection rules are resolved to their
    target URL as well.
    """

    def __init__(self, dataURl
                 = "https://gitlab.com/KevinRoebert/ClearUrls/raw/master/data/data.min.json"):
        # NOTE: parameter name `dataURl` kept as-is for keyword-call
        # compatibility. The URL is stored but initRules() currently
        # reads the local file instead (see commented-out download).
        self.dataURL = dataURl
        # maps provider urlPattern regex -> expanded rules/exceptions/redirections
        self.pages = {}
        self.initRules()

    def initRules(self):
        """Load data.min.json and expand each provider into self.pages."""
        # download ClearURLs rule set
        #data = urlopen(self.dataURL).read()
        # load rule set from local file; `with` guarantees the handle is closed
        with open("data.min.json") as rule_file:
            json_data = json.load(rule_file)
        # extract and expand rules
        for provider, spec in json_data["providers"].items():
            urlPattern = spec["urlPattern"]
            self.pages[urlPattern] = {
                # each expanded rule matches "<separator><param>=<value>" so the
                # whole tracking parameter can be removed from the query string
                "rules": ["([\\/|\\?]|(&|&amp;))(" + rule + "=[^\\/|\\?|&]*)"
                          for rule in spec["rules"]],
                "exceptions": spec["exceptions"],
                "redirections": spec["redirections"],
            }

    def clean(self, url):
        """Return `url` with tracking parameters removed.

        A matching exception returns the url untouched; a matching
        redirection returns the unquoted redirect target instead.
        """
        domain = re.sub(r"\?.*", "", url)
        # query string including the leading "?"; empty when there is no query
        # (previously "?"+url, which could turn path text into fake params)
        fields = "?" + re.sub(r".*?\?", "", url) if "?" in url else ""
        for page, spec in self.pages.items():
            if not re.search(page, url):
                continue
            # exceptions win: leave the url completely alone
            for exception in spec["exceptions"]:
                if re.search(exception, url):
                    return url
            # redirection rules capture the real target in group 1
            for redirection in spec["redirections"]:
                match = re.search(redirection, url)
                if match:
                    return unquote(match.group(1))
            # strip every tracking parameter this provider knows about
            for rule in spec["rules"]:
                fields = re.sub(rule, "", fields)
        # rebuild the query string from whatever parameters survived
        finalFields = re.findall("[^\\/|\\?|&]+=[^\\/|\\?|&]+", fields)
        if finalFields:
            return domain + "?" + "&".join(finalFields)
        return domain

BIN
Mastodon_Trash.png Executable file

Binary file not shown.

After

Width:  |  Height:  |  Size: 287 KiB

9
README.md Normal file
View file

@ -0,0 +1,9 @@
# masto-clear-url
A mastodon bot to raise awareness of tracking elements in URLs.
It uses the [ClearURLs](https://gitlab.com/KevinRoebert/ClearUrls) script and rules.
Be careful not to spam people without their consent.
![a mastodon in a trash bin](Mastodon_Trash.png)

60
bot.py Executable file
View file

@ -0,0 +1,60 @@
"""
Mastodon bot to clean URLs
"""
from mastodon import Mastodon, StreamListener
from urllib.parse import urlparse
import validators
from bs4 import BeautifulSoup
import ClearURLsCore
import signal
# Mastodon API client used for streaming and replying.
mastodon = Mastodon(
    # file with your account token
    access_token='token.secret',
    # instance url -- NOTE(review): empty placeholder, must be filled in
    # with the bot account's instance (e.g. "https://example.social")
    api_base_url=''
)
class Listener(StreamListener):
    """Stream listener that replies to toots containing trackable URLs."""

    def on_update(self, toot):
        """A new status has appeared! `toot` is the parsed JSON dictionary
        describing the status.

        Extracts every anchor from the toot's HTML content, cleans each
        valid URL with ClearURLs, and publicly replies with the cleaned
        version(s) when trackers were actually removed.
        """
        soup = BeautifulSoup(toot.content, 'html.parser')
        cleaned_urls = []
        for link in soup.find_all('a'):
            url = link.get('href')
            # anchors may carry no href at all -> guard before validating
            if not url or not validators.url(url):
                continue
            clean_url = cleaner.clean(url)
            if clean_url == url:
                continue
            # if clean_url and url are almost the same size it's probably
            # too spammy to toot about it
            if abs(len(clean_url) - len(url)) < 6:
                continue
            cleaned_urls.append(clean_url)
        # reply only when at least one URL was actually cleaned
        if cleaned_urls:
            plural = "s" if len(cleaned_urls) > 1 else ""
            status = "The URL" + plural + " you posted contains trackers!\nI cleaned it for you:\n\r"
            for clean_url in cleaned_urls:
                status = status + clean_url + "\n"
            # idempotency key = toot id, so a stream replay can't double-post
            mastodon.status_reply(toot, status, in_reply_to_id=toot.id, media_ids=None, sensitive=False, visibility="public", spoiler_text=None, language="en", idempotency_key=str(toot.id), content_type=None, scheduled_at=None, poll=None, untag=True)
# NOTE(review): result is never used -- presumably a connectivity sanity
# check; confirm whether the bot should abort when the stream is unhealthy.
is_healthy = mastodon.stream_healthy()
listener = Listener()
# ClearURLs engine; loads rules from the local data.min.json at startup
cleaner = ClearURLsCore.ClearURLsCore()
# stream the local timeline asynchronously; `handle` is the worker handle
handle = mastodon.stream_local(listener, run_async=True, timeout=300, \
    reconnect_async=False, reconnect_async_wait_sec=5)
# block the main thread forever; the async stream thread does the work
signal.pause()

2635
data.min.json Normal file

File diff suppressed because it is too large Load diff