first commit

This commit is contained in:
Swrup 2022-03-26 17:03:26 +01:00
commit 20f6dc6d0e
5 changed files with 2782 additions and 0 deletions

78
ClearURLsCore.py Executable file
View file

@ -0,0 +1,78 @@
'''
* ClearURLs
* Copyright (c) 2019 Kevin Röbert
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
from urllib.parse import unquote
from urllib.request import urlopen
import json
import re
'''
* Python class that uses the ClearURLs data.min.json
* to clean urls. Redirections are also handled.
*
* @param dataURL The url to the data.min.json,
* default is the gitlab repo of ClearURLs
'''
class ClearURLsCore:
    """Strip known tracking parameters from URLs using the ClearURLs rule set.

    Rules are loaded from a local ``data.min.json`` (same format as the
    ClearURLs browser extension). Redirection rules are resolved to their
    target URL as well.
    """

    def __init__(self, dataURl
                 = "https://gitlab.com/KevinRoebert/ClearUrls/raw/master/data/data.min.json"):
        # NOTE: parameter name `dataURl` kept as-is for keyword-call
        # compatibility. The URL is stored but initRules() currently
        # reads the local file instead (see commented-out download).
        self.dataURL = dataURl
        # maps provider urlPattern regex -> expanded rules/exceptions/redirections
        self.pages = {}
        self.initRules()

    def initRules(self):
        """Load data.min.json and expand each provider into self.pages."""
        # download ClearURLs rule set
        #data = urlopen(self.dataURL).read()
        # load rule set from local file; `with` guarantees the handle is closed
        with open("data.min.json") as rule_file:
            json_data = json.load(rule_file)
        # extract and expand rules
        for provider, spec in json_data["providers"].items():
            urlPattern = spec["urlPattern"]
            self.pages[urlPattern] = {
                # each expanded rule matches "<separator><param>=<value>" so the
                # whole tracking parameter can be removed from the query string
                "rules": ["([\\/|\\?]|(&|&amp;))(" + rule + "=[^\\/|\\?|&]*)"
                          for rule in spec["rules"]],
                "exceptions": spec["exceptions"],
                "redirections": spec["redirections"],
            }

    def clean(self, url):
        """Return `url` with tracking parameters removed.

        A matching exception returns the url untouched; a matching
        redirection returns the unquoted redirect target instead.
        """
        domain = re.sub(r"\?.*", "", url)
        # query string including the leading "?"; empty when there is no query
        # (previously "?"+url, which could turn path text into fake params)
        fields = "?" + re.sub(r".*?\?", "", url) if "?" in url else ""
        for page, spec in self.pages.items():
            if not re.search(page, url):
                continue
            # exceptions win: leave the url completely alone
            for exception in spec["exceptions"]:
                if re.search(exception, url):
                    return url
            # redirection rules capture the real target in group 1
            for redirection in spec["redirections"]:
                match = re.search(redirection, url)
                if match:
                    return unquote(match.group(1))
            # strip every tracking parameter this provider knows about
            for rule in spec["rules"]:
                fields = re.sub(rule, "", fields)
        # rebuild the query string from whatever parameters survived
        finalFields = re.findall("[^\\/|\\?|&]+=[^\\/|\\?|&]+", fields)
        if finalFields:
            return domain + "?" + "&".join(finalFields)
        return domain

BIN
Mastodon_Trash.png Executable file

Binary file not shown.

After

Width:  |  Height:  |  Size: 287 KiB

9
README.md Normal file
View file

@ -0,0 +1,9 @@
# masto-clear-url
A mastodon bot to raise awareness of tracking elements in URLs.
It uses the [ClearURLs](https://gitlab.com/KevinRoebert/ClearUrls) script and rules.
Be careful not to spam people without their consent.
![a mastodon in a trash bin](Mastodon_Trash.png)

60
bot.py Executable file
View file

@ -0,0 +1,60 @@
"""
Mastodon bot to clean URLs
"""
from mastodon import Mastodon, StreamListener
from urllib.parse import urlparse
import validators
from bs4 import BeautifulSoup
import ClearURLsCore
import signal
# Mastodon API client used for streaming and replying.
mastodon = Mastodon(
    # file with your account token
    access_token='token.secret',
    # instance url -- NOTE(review): empty placeholder, must be filled in
    # with the bot account's instance (e.g. "https://example.social")
    api_base_url=''
)
class Listener(StreamListener):
    """Stream listener that replies to toots containing trackable URLs."""

    def on_update(self, toot):
        """A new status has appeared! `toot` is the parsed JSON dictionary
        describing the status.

        Extracts every anchor from the toot's HTML content, cleans each
        valid URL with ClearURLs, and publicly replies with the cleaned
        version(s) when trackers were actually removed.
        """
        soup = BeautifulSoup(toot.content, 'html.parser')
        cleaned_urls = []
        for link in soup.find_all('a'):
            url = link.get('href')
            # anchors may carry no href at all -> guard before validating
            if not url or not validators.url(url):
                continue
            clean_url = cleaner.clean(url)
            if clean_url == url:
                continue
            # if clean_url and url are almost the same size it's probably
            # too spammy to toot about it
            if abs(len(clean_url) - len(url)) < 6:
                continue
            cleaned_urls.append(clean_url)
        # reply only when at least one URL was actually cleaned
        if cleaned_urls:
            plural = "s" if len(cleaned_urls) > 1 else ""
            status = "The URL" + plural + " you posted contains trackers!\nI cleaned it for you:\n\r"
            for clean_url in cleaned_urls:
                status = status + clean_url + "\n"
            # idempotency key = toot id, so a stream replay can't double-post
            mastodon.status_reply(toot, status, in_reply_to_id=toot.id, media_ids=None, sensitive=False, visibility="public", spoiler_text=None, language="en", idempotency_key=str(toot.id), content_type=None, scheduled_at=None, poll=None, untag=True)
# NOTE(review): result is never used -- presumably a connectivity sanity
# check; confirm whether the bot should abort when the stream is unhealthy.
is_healthy = mastodon.stream_healthy()
listener = Listener()
# ClearURLs engine; loads rules from the local data.min.json at startup
cleaner = ClearURLsCore.ClearURLsCore()
# stream the local timeline asynchronously; `handle` is the worker handle
handle = mastodon.stream_local(listener, run_async=True, timeout=300, \
    reconnect_async=False, reconnect_async_wait_sec=5)
# block the main thread forever; the async stream thread does the work
signal.pause()

2635
data.min.json Normal file

File diff suppressed because it is too large Load diff