masto_clear_url/ClearURLsCore.py

'''
* ClearURLs
* Copyright (c) 2019 Kevin Röbert
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''
from urllib.parse import unquote
from urllib.request import urlopen
import json
import re

'''
 * Python class that uses the ClearURLs data.min.json rule set
 * to clean URLs. Redirections are resolved as well.
 *
 * @param dataURl The URL of the data.min.json;
 * the default is the GitLab repository of ClearURLs
'''
class ClearURLsCore:
    """Remove tracking parameters from URLs using the ClearURLs rule set.

    The rule set is read from a local ``data.min.json`` file (relative to
    the current working directory) when an instance is created.

    Args:
        dataURl: URL of the ClearURLs ``data.min.json``. It is stored on the
            instance as ``dataURL`` but is not used for loading: the rules
            always come from the local file.
    """

    def __init__(
        self,
        dataURl="https://gitlab.com/KevinRoebert/ClearUrls/raw/master/data/data.min.json",
    ):
        self.dataURL = dataURl
        # Maps a provider's urlPattern regex to a dict with the keys
        # "rules", "exceptions" and "redirections" (each a list of regexes).
        self.pages = {}
        self.initRules()

    def initRules(self):
        """Load ``data.min.json`` from disk and fill ``self.pages``.

        Raises:
            OSError: if ``data.min.json`` cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError: if the document has no ``providers`` entry or a
                provider has no ``urlPattern``.
        """
        # NOTE: downloading the rule set from self.dataURL is deliberately
        # not done here; a local copy of the rule set is used instead.
        with open("data.min.json", encoding="utf-8") as ruleFile:
            json_data = json.load(ruleFile)
        self._loadProviders(json_data)

    def _loadProviders(self, json_data):
        """Expand the providers of a parsed rule set into ``self.pages``."""
        for provider in json_data["providers"].values():
            # Each bare parameter name becomes a regex that matches the
            # parameter together with the delimiter in front of it, so that
            # substituting "" removes the whole "&name=value" piece.
            expandedRules = [
                "([\\/|\\?]|(&|&amp;))(" + rule + "=[^\\/|\\?|&]*)"
                for rule in provider.get("rules", [])
            ]
            # Optional lists may be missing from a provider entry; treat a
            # missing list as empty instead of failing with KeyError.
            self.pages[provider["urlPattern"]] = {
                "rules": expandedRules,
                "exceptions": provider.get("exceptions", []),
                "redirections": provider.get("redirections", []),
            }

    def clean(self, url):
        """Return ``url`` with all matching tracking parameters removed.

        For every provider whose ``urlPattern`` matches ``url``:

        * if one of its exceptions matches, ``url`` is returned unchanged;
        * if one of its redirections matches, the URL-decoded first capture
          group of that redirection is returned;
        * otherwise its rules are applied to the query string.

        Only ``name=value`` pairs with a non-empty name and value survive in
        the rebuilt query string. A URL without a ``?`` has no query string
        and is returned unchanged unless an exception or redirection applies.

        Args:
            url: The URL to clean.

        Returns:
            The cleaned URL.
        """
        # Split at the FIRST "?" only. When there is no "?", the query part
        # is empty, so "=" signs in the path or fragment are never mistaken
        # for query parameters.
        domain, _, query = url.partition("?")
        # The leading "?" lets the rule regexes (which expect a delimiter
        # before the parameter name) match the first parameter as well.
        fields = "?" + query

        for pattern, page in self.pages.items():
            if not re.search(pattern, url):
                continue
            if any(re.search(exception, url) for exception in page["exceptions"]):
                return url
            for redirection in page["redirections"]:
                match = re.search(redirection, url)
                if match:
                    return unquote(match.group(1))
            for rule in page["rules"]:
                fields = re.sub(rule, "", fields)

        finalFields = re.findall("[^\\/|\\?|&]+=[^\\/|\\?|&]+", fields)
        if finalFields:
            return domain + "?" + "&".join(finalFields)
        return domain
first commit 2022-03-26 17:03:26 +01:00			`'''`
			`* ClearURLs`
			`* Copyright (c) 2019 Kevin Röbert`
			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU Lesser General Public License as published by`
			`* the Free Software Foundation, either version 3 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`'''`
			`from urllib.parse import unquote`
			`from urllib.request import urlopen`
			`import json`
			`import re`

			`'''`
			`* Python class that used the ClearURLs data.min.json`
			`* to clean urls. Also redirections included.`
			`*`
			`* @param dataURL The url to the data.min.json,`
			`* default is the gitlab repo of ClearURLs`
			`'''`
			`class ClearURLsCore:`
			`def __init__(self, dataURl`
			`= "https://gitlab.com/KevinRoebert/ClearUrls/raw/master/data/data.min.json"):`
			`self.dataURL = dataURl`
			`self.pages = {}`
			`self.initRules()`

			`def initRules(self):`
			`# download ClearURLs rule set`
			`#data = urlopen(self.dataURL).read()`

			`#load rule set from local file`
			`data = open("data.min.json").read()`
			`json_data = json.loads(data)`

			`# extract and expand rules`
			`for provider in json_data["providers"]:`
			`urlPattern = json_data["providers"][provider]["urlPattern"]`
			`self.pages[urlPattern] = {`
			`"rules": [],`
			`"exceptions": [],`
			`"redirections": []`
			`}`

			`for rule in json_data["providers"][provider]["rules"]:`
			`self.pages[urlPattern]["rules"].append("([\\/\|\\?]\|(&\|&))("+rule+"=[^\\/\|\\?\|&]*)")`

			`self.pages[urlPattern]["exceptions"] = json_data["providers"][provider]["exceptions"]`
			`self.pages[urlPattern]["redirections"] = json_data["providers"][provider]["redirections"]`

			`def clean(self, url):`
			`domain = re.sub("\\?.*", "", url)`
			`fields = "?"+re.sub(".*?\\?", "", url)`

			`for page in self.pages:`
			`if re.search(page, url):`
			`for exception in self.pages[page]["exceptions"]:`
			`if re.search(exception, url):`
			`return url`
			`for redirection in self.pages[page]["redirections"]:`
			`if re.search(redirection, url):`
			`result = re.search(redirection, url).group(1)`
			`return unquote(result)`
			`for rule in self.pages[page]["rules"]:`
			`fields = re.sub(rule, "", fields)`
			`finalFields = re.findall("[^\\/\|\\?\|&]+=[^\\/\|\\?\|&]+", fields)`
			`if len(finalFields) > 0:`
			`return domain + "?" + "&".join(finalFields);`
			`return domain`