first commit

This commit is contained in:
Swrup 2022-04-03 16:45:33 +02:00
commit 276bb92d08
10 changed files with 294 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
_build

42
.ocamlformat Normal file
View file

@ -0,0 +1,42 @@
version=0.20.1
assignment-operator=end-line
break-cases=fit
break-fun-decl=wrap
break-fun-sig=wrap
break-infix=wrap
break-infix-before-func=false
break-separators=before
break-sequences=true
cases-exp-indent=2
cases-matching-exp-indent=normal
doc-comments=before
doc-comments-padding=2
doc-comments-tag-only=default
dock-collection-brackets=false
exp-grouping=preserve
field-space=loose
if-then-else=compact
indicate-multiline-delimiters=space
indicate-nested-or-patterns=unsafe-no
infix-precedence=indent
leading-nested-match-parens=false
let-and=sparse
let-binding-spacing=compact
let-module=compact
margin=80
max-indent=68
module-item-spacing=sparse
ocp-indent-compat=false
parens-ite=false
parens-tuple=always
parse-docstrings=true
sequence-blank-line=preserve-one
sequence-style=terminator
single-case=compact
space-around-arrays=true
space-around-lists=true
space-around-records=true
space-around-variants=true
type-decl=sparse
wrap-comments=false
wrap-fun-args=true

8
LICENSE.md Normal file
View file

@ -0,0 +1,8 @@
The ISC License (ISC)
=====================
Copyright © 2022, TODO
Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

23
README.md Normal file
View file

@ -0,0 +1,23 @@
# clear_url
clear_url is an [OCaml] library to remove tracking elements from URLs.
It is based on [ClearURLs](https://gitlab.com/KevinRoebert/ClearUrls)
## Usage
```ocaml
let dirty_url = "https://example.com?utm_source=newsletter1&utm_medium=email&utm_campaign=sale"
let () = Format.printf (Clear_url.clean dirty_url)
```
## About
- [LICENSE]
[example]: ./example
[LICENSE]: ./LICENSE.md
[opam file]: ./clear_url.opam
[OCaml]: https://ocaml.org

31
clear_url.opam Normal file
View file

@ -0,0 +1,31 @@
# This file is generated by dune, edit dune-project instead
opam-version: "2.0"
synopsis: "OCaml library to remove tracking in URLs"
description: "Use `Clear_url.clean` to remove tracking elements from URLs."
authors: ["swrup <swrup@protonmail.com>"]
license: "ISC"
tags: ["clear_url" "clean" "url" "tracker" "web3"]
homepage: "https://git.zapashcanon.fr/swrup/clear-url"
bug-reports: "https://git.zapashcanon.fr/swrup/ocaml-clear-url/issues"
depends: [
"dune" {>= "2.9"}
"ocaml" {>= "4.08"}
"odoc" {with-doc}
]
build: [
["dune" "subst"] {dev}
[
"dune"
"build"
"-p"
name
"-j"
jobs
"--promote-install-files=false"
"@install"
"@runtest" {with-test}
"@doc" {with-doc}
]
["dune" "install" "-p" name "--create-install-files" name]
]
dev-repo: "git+https://git.zapashcanon.fr/swrup/clear-url.git"

29
dune-project Normal file
View file

@ -0,0 +1,29 @@
(lang dune 2.9)
(implicit_transitive_deps false)
(name clear_url)
(license ISC)
(authors "swrup <swrup@protonmail.com>")
(source
(uri git+https://git.zapashcanon.fr/swrup/clear-url.git))
(homepage https://git.zapashcanon.fr/swrup/clear-url)
(bug_reports https://git.zapashcanon.fr/swrup/ocaml-clear-url/issues)
(generate_opam_files true)
(package
(name clear_url)
(synopsis "OCaml library to remove tracking in URLs")
(description
"Use `Clear_url.clean` to remove tracking elements from URLs.")
(tags
(clear_url clean url tracker web3))
(depends
(ocaml
(>= 4.08))))

131
src/clear_url.ml Normal file
View file

@ -0,0 +1,131 @@
open Yojson.Basic.Util
let file =
match Fpath.of_string "./src/rules.json" with
| Ok f -> f
| Error (`Msg s) -> failwith s
let parsed =
match Bos.OS.File.read file with
| Error (`Msg s) -> failwith s
| Ok s -> Yojson.Safe.to_basic (Yojson.Safe.from_string s)
type regexps =
{ urlpattern : Re.re
; rawrules : Re.re array
; rules : Re.re array
; exceptions : Re.re array
; redirections : Re.re array
}
let values = parsed |> member "providers" |> values |> Array.of_list
let regexps_array =
Array.map
(fun value ->
let urlpattern =
value |> member "urlPattern" |> to_string |> Re.Perl.re |> Re.compile
in
let get_array key =
value
|> (fun el ->
(*we can't use `filter_member "rules"` here because empty list
* where removed from the file ..*)
match member key el with
| `Null -> (*it was an empty list *) []
| json -> to_list json )
|> filter_string |> Array.of_list
in
let compile_array l =
l |> Array.map (fun s -> Re.compile @@ Re.Perl.re s)
in
(* referralMarketing should be in rules *)
let rules =
Array.append (get_array "rules") (get_array "referralMarketing")
|> Array.map (Format.sprintf "(?:&amp;|[/?#&])(?:%s=[^&]*)")
|> compile_array
in
let rawrules = get_array "rawRules" |> compile_array in
let exceptions = get_array "exceptions" |> compile_array in
let redirections = get_array "redirections" |> compile_array in
{ urlpattern; rawrules; rules; exceptions; redirections } )
values
let split_queries_regex = Re.compile (Re.Perl.re "[^\\/|\\?|&]+=[^\\/|\\?|&]+")
(* see https://docs.clearurls.xyz/1.23.0/specs/rules/ *)
let clean url =
(* filter by url pattern *)
let regexps_list =
Array.fold_left
(fun acc regexps ->
if Re.matches regexps.urlpattern url <> [] then regexps :: acc else acc
)
[] regexps_array
in
let is_exception =
List.exists
(fun t ->
Array.exists (fun except -> Re.matches except url <> []) t.exceptions )
regexps_list
in
if is_exception then url
else
(*TODO implement redirection, is it always percent-encoded ?? *)
(* apply rawrules to whole url *)
let url =
List.fold_left
(fun acc t ->
Array.fold_left
(fun acc rawrule -> Re.replace_string ~all:true rawrule ~by:"" acc)
acc t.rawrules )
url regexps_list
in
let uri = Uri.of_string url in
match Uri.verbatim_query uri with
| None -> url
| Some query ->
let query = "?" ^ query in
let fragment =
Option.fold ~none:"" ~some:(fun s -> "#" ^ s) (Uri.fragment uri)
in
(* let query = "?" ^ Re.replace_string get_query_regex ~by:"" url in *)
let base =
String.sub url 0
(String.length url - String.length query - String.length fragment)
in
(* apply rules to query *)
let apply_rules s =
List.fold_left
(fun s t ->
Array.fold_left
(fun acc rule -> Re.replace_string ~all:true rule ~by:"" acc)
s t.rules )
s regexps_list
in
let clean_query = apply_rules query in
(* TODO fragment might contains tracking fields as well (really?) *)
let query_list = Re.matches split_queries_regex clean_query in
let pp_field_list =
Format.pp_print_list
~pp_sep:(fun fmt () -> Format.pp_print_string fmt "&")
Format.pp_print_string
in
let pp_query fmt query_list =
match query_list with
| [] -> Format.fprintf fmt ""
| query_list -> Format.fprintf fmt "?%a" pp_field_list query_list
in
Format.asprintf "%s%a%s" base pp_query query_list fragment
(* TODO implement link unshortening like https://github.com/AmanoTeam/Unalix/blob/master/unalix/core/url_unshort.py *)

18
src/dune Normal file
View file

@ -0,0 +1,18 @@
(library
(name clear_url)
(modules clear_url)
(libraries bos fpath re uri yojson))
(executable
(name updaterules)
(modules updaterules)
(libraries lwt lwt.unix cohttp-lwt cohttp-lwt-unix uri yojson))
(rule
(target rules.json)
(deps updaterules.exe)
(mode promote)
(action
(with-stdout-to
rules.json
(run ./updaterules.exe))))

1
src/rules.json Normal file

File diff suppressed because one or more lines are too long

10
src/updaterules.ml Normal file
View file

@ -0,0 +1,10 @@
open Lwt.Syntax
let url = "https://rules2.clearurls.xyz/data.minify.json"
let program =
let* _, body = Cohttp_lwt_unix.Client.get (url |> Uri.of_string) in
let* body = Cohttp_lwt.Body.to_string body in
Lwt_io.write_line Lwt_io.stdout body
let () = Lwt_main.run program