Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save winzig/8894715 to your computer and use it in GitHub Desktop.
Save winzig/8894715 to your computer and use it in GitHub Desktop.

Revisions

  1. winzig revised this gist Jul 31, 2018. No changes.
  2. winzig revised this gist Jul 31, 2018. 1 changed file with 20 additions and 19 deletions.
    39 changes: 20 additions & 19 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -1,23 +1,21 @@
    # Single-line version:
    (?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+\.(?:[a-z0-9]{2,13})\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z0-9]{2,13})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])))

    (?i)\b(https?:\/{1,3})?((?:(?:[\w.\-]+\.(?:[a-z]{2,13})|(?<=http:\/\/|https:\/\/)[\w.\-]+)\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)(?:\w+(?:[.\-]+\w+)*\.(?:[a-z]{2,13})|(?:(?:[0-9](?!\d)|[1-9][0-9](?!\d)|1[0-9]{2}(?!\d)|2[0-4][0-9](?!\d)|25[0-5](?!\d))[.]?){4})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))*(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])?))

    # Commented multi-line version:

    (?xi)
    \b
    ( # Capture 1: entire matched URL
    (?:
    https?: # URL protocol and colon
    (https?:\/{1,3})? # Capture $1: (optional) URL scheme, colon, and slashes
    ( # Capture $2: Entire matched URL (other than optional protocol://)
    (?:
    (?:
    \/{1,3} # 1-3 slashes
    | # or
    [a-z0-9%] # Single letter or digit or '%'
    # (Trying not to match e.g. "URI::Escape")
    [\w.\-]+\. # looks like domain name
    (?:[a-z]{2,13}) # ending in common popular gTLDs
    | #
    (?<=http:\/\/|https:\/\/)[\w.\-]+ # hostname preceded by http:// or https://
    )
    | # or
    [a-z0-9.\-]+\. # looks like domain name
    (?:[a-z0-9]{2,13}) # ending in common popular gTLDs (or final octet of IPv4 IP)
    \/ # followed by a slash
    \/ # followed by a slash
    )
    (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    @@ -36,11 +34,14 @@
    | # OR, the following to match naked domains:
    (?:
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_(?<![@.])
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    \. # avoid matching the last two parts of an email domain
    # like co.uk in user@example.co.uk
    (?:[a-z0-9]{2,13}) # ending in common popular gTLDs (or final octet of IPv4 IP)
    (?:
    \w+
    (?:[.\-]+\w+)*
    \. # avoid matching the last two parts of an email domain like co.uk in user@example.co.uk
    (?:[a-z]{2,13}) # ending in common popular gTLDs
    | # or
    (?:(?:[0-9](?!\d)|[1-9][0-9](?!\d)|1[0-9]{2}(?!\d)|2[0-4][0-9](?!\d)|25[0-5](?!\d))[.]?){4} # IPv4 address, as seen in https://stackoverflow.com/a/13166657/650558
    )
    \b
    \/?
    (?!@) # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
    @@ -50,13 +51,13 @@
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    )+
    )*
    (?: # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    | # or
    [^\s`!()\[\]{};:'\".,<>?«»“”‘’] # not a space or one of these punct chars
    )
    )?
    )
    )
  3. winzig revised this gist May 16, 2018. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    # Single-line version:
    (?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+\.(?:[a-z]{2,13})\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])))
    (?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+\.(?:[a-z0-9]{2,13})\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z0-9]{2,13})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])))

    # Commented multi-line version:

    @@ -16,7 +16,7 @@
    )
    | # or
    [a-z0-9.\-]+\. # looks like domain name
    (?:[a-z]{2,13}) # ending in common popular gTLDs
    (?:[a-z0-9]{2,13}) # ending in common popular gTLDs (or final octet of IPv4 IP)
    \/ # followed by a slash
    )
    (?: # One or more:
    @@ -40,7 +40,7 @@
    (?:[.\-][a-z0-9]+)*
    \. # avoid matching the last two parts of an email domain
    # like co.uk in user@example.co.uk
    (?:[a-z]{2,13})
    (?:[a-z0-9]{2,13}) # ending in common popular gTLDs (or final octet of IPv4 IP)
    \b
    \/?
    (?!@) # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
  4. winzig revised this gist May 16, 2018. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    # Single-line version:
    (?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+\.\/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])))
    (?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+\.(?:[a-z]{2,13})\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])))

    # Commented multi-line version:

    @@ -15,10 +15,10 @@
    # (Trying not to match e.g. "URI::Escape")
    )
    | # or
    [a-z0-9.\-]+\. # looks like domain name followed by a slash:
    \/
    [a-z0-9.\-]+\. # looks like domain name
    (?:[a-z]{2,13}) # ending in common popular gTLDs
    \/ # followed by a slash
    )
    (?:[a-z]{2,13}) # Covers common popular gTLDs
    (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    | # or
  5. winzig revised this gist May 16, 2018. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    # Single-line version:
    (?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+\.\/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])))
    (?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+\.\/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])))

    # Commented multi-line version:

    @@ -31,7 +31,7 @@
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    | # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
    [^\s`!()\[\]{};:'\".,<>?«»“”‘’] # not a space or one of these punct chars
    )
    | # OR, the following to match naked domains:
    (?:
    @@ -56,7 +56,7 @@
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    | # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
    [^\s`!()\[\]{};:'\".,<>?«»“”‘’] # not a space or one of these punct chars
    )
    )
    )
  6. winzig revised this gist May 16, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -35,7 +35,7 @@
    )
    | # OR, the following to match naked domains:
    (?:
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_(?<![@.])
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_(?<![@.])
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    \. # avoid matching the last two parts of an email domain
  7. winzig revised this gist May 16, 2018. No changes.
  8. winzig revised this gist May 16, 2018. 1 changed file with 41 additions and 27 deletions.
    68 changes: 41 additions & 27 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -1,48 +1,62 @@
    # Single-line version:
    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.]/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b/?(?!@)))
    (?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+\.\/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b\/?(?!@)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])))

    # Commented multi-line version:

    (?xi)
    \b
    ( # Capture 1: entire matched URL
    ( # Capture 1: entire matched URL
    (?:
    https?: # URL protocol and colon
    https?: # URL protocol and colon
    (?:
    /{1,3} # 1-3 slashes
    | # or
    [a-z0-9%] # Single letter or digit or '%'
    # (Trying not to match e.g. "URI::Escape")
    \/{1,3} # 1-3 slashes
    | # or
    [a-z0-9%] # Single letter or digit or '%'
    # (Trying not to match e.g. "URI::Escape")
    )
    | # or
    [a-z0-9.\-]+[.] # looks like domain name followed by a slash:
    /
    | # or
    [a-z0-9.\-]+\. # looks like domain name followed by a slash:
    \/
    )
    (?:[a-z]{2,13}) # Covers common popular gTLDs
    (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    | # or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    (?:[a-z]{2,13}) # Covers common popular gTLDs
    (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    | # or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    )+
    (?: # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    (?: # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    | # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    | # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
    )
    | # OR, the following to match naked domains:
    | # OR, the following to match naked domains:
    (?:
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_(?<![@.])
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_(?<![@.])
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.] # avoid matching the last two parts of an email domain
    # like co.uk in user@example.co.uk
    \. # avoid matching the last two parts of an email domain
    # like co.uk in user@example.co.uk
    (?:[a-z]{2,13})
    \b
    /?
    (?!@) # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
    \/?
    (?!@) # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
    (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    | # or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    )+
    (?: # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    | # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
    )
    )
    )
  9. winzig revised this gist May 3, 2018. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -37,8 +37,8 @@
    (?:
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_(?<![@.])
    [a-z0-9]+
    (?:\b[.\-][a-z0-9]+)*
    (?<![@.]) # avoid matching the last two parts of an email domain
    (?:[.\-][a-z0-9]+)*
    [.] # avoid matching the last two parts of an email domain
    # like co.uk in user@example.co.uk
    (?:[a-z]{2,13})
    \b
  10. winzig revised this gist May 3, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    # Single-line version:
    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.]/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:\b[.\-][a-z0-9]+)*(?<![@.])(?:[a-z]{2,13})\b/?(?!@)))
    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.]/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b/?(?!@)))

    # Commented multi-line version:

  11. winzig revised this gist May 3, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    # Single-line version:
    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.]/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b/?(?!@)))
    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.]/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:\b[.\-][a-z0-9]+)*(?<![@.])(?:[a-z]{2,13})\b/?(?!@)))

    # Commented multi-line version:

  12. winzig revised this gist May 3, 2018. 1 changed file with 22 additions and 22 deletions.
    44 changes: 22 additions & 22 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -5,41 +5,41 @@

    (?xi)
    \b
    ( # Capture 1: entire matched URL
    ( # Capture 1: entire matched URL
    (?:
    https?: # URL protocol and colon
    https?: # URL protocol and colon
    (?:
    /{1,3} # 1-3 slashes
    | # or
    [a-z0-9%] # Single letter or digit or '%'
    # (Trying not to match e.g. "URI::Escape")
    /{1,3} # 1-3 slashes
    | # or
    [a-z0-9%] # Single letter or digit or '%'
    # (Trying not to match e.g. "URI::Escape")
    )
    | # or
    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    | # or
    [a-z0-9.\-]+[.] # looks like domain name followed by a slash:
    /
    )
    (?:[a-z]{2,13})
    (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    | # or
    (?:[a-z]{2,13}) # Covers common popular gTLDs
    (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    | # or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    )+
    (?: # End with:
    (?: # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    | # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    | # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
    )
    | # OR, the following to match naked domains:
    | # OR, the following to match naked domains:
    (?:
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_(?<![@.])
    [a-z0-9]+
    (?:\b[.\-][a-z0-9]+)*
    [.]
    (?:\b[.\-][a-z0-9]+)*
    (?<![@.]) # avoid matching the last two parts of an email domain
    # like co.uk in user@example.co.uk
    (?:[a-z]{2,13})
    \b
    /?
  13. winzig revised this gist May 3, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -38,7 +38,7 @@
    (?:
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    (?:\b[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
  14. winzig revised this gist May 3, 2018. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -17,9 +17,9 @@
    | # or
    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
    /
    )
    (?:[a-z]{2,13})
    (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    | # or
  15. winzig revised this gist May 3, 2018. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,5 @@
    # Single-line version:
    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b/?(?!@)))

    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.]/)(?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b/?(?!@)))

    # Commented multi-line version:

  16. winzig revised this gist Feb 9, 2014. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions Liberal Regex Pattern for URLs
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    # Single-line version:
    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))
    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:[a-z]{2,13})(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:[a-z]{2,13})\b/?(?!@)))


    # Commented multi-line version:
    @@ -18,7 +18,7 @@
    | # or
    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj| Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
    (?:[a-z]{2,13})
    /
    )
    (?: # One or more:
    @@ -41,7 +41,7 @@
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj| Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
    (?:[a-z]{2,13})
    \b
    /?
    (?!@) # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
  17. @gruber gruber renamed this gist Feb 8, 2014. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  18. @gruber gruber created this gist Feb 8, 2014.
    49 changes: 49 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,49 @@
    # Single-line version:
    (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))


    # Commented multi-line version:

    (?xi)
    \b
    ( # Capture 1: entire matched URL
    (?:
    https?: # URL protocol and colon
    (?:
    /{1,3} # 1-3 slashes
    | # or
    [a-z0-9%] # Single letter or digit or '%'
    # (Trying not to match e.g. "URI::Escape")
    )
    | # or
    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj| Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
    /
    )
    (?: # One or more:
    [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
    | # or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    )+
    (?: # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\) # balanced parens, non-recursive: (…)
    | # or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
    )
    | # OR, the following to match naked domains:
    (?:
    (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj| Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
    \b
    /?
    (?!@) # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
    )
    )