Skip to content

Instantly share code, notes, and snippets.

@stain
Last active July 13, 2023 09:45
Show Gist options
  • Save stain/9bbc97c2388eaa178296a2afc4f5a277 to your computer and use it in GitHub Desktop.
Save stain/9bbc97c2388eaa178296a2afc4f5a277 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# Module metadata (authorship, licence and release information).
__author__ = 'Eric Van Cleve, Stian Soiland-Reyes'
__copyright__ = 'Copyright 2019, Proofpoint Inc, 2023 The University of Manchester'
__license__ = 'GPL v.3'
__version__ = '3.1-rewriter'
__email__ = 'evancleve@proofpoint.com'
__status__ = 'Production'
## PROMINENT NOTICE:
# This script was modified 2023-07-13 by Stian Soiland-Reyes, The University of Manchester, UK
# - Added URLDefenseDecoder.v3_matcher pattern
# - Added option to work as stdin/stdout filter
## https://help.proofpoint.com/Threat_Insight_Dashboard/Concepts/How_do_I_decode_a_rewritten_URL%3F
# https://help.proofpoint.com/@api/deki/files/2775/urldecoder.py?revision=1
import sys
import re
import string
from argparse import ArgumentParser
from base64 import urlsafe_b64decode
if sys.version_info[0] < 3:
from urllib import unquote
import HTMLParser
htmlparser = HTMLParser.HTMLParser()
unescape = htmlparser.unescape
from string import maketrans
else:
from urllib.parse import unquote
from html import unescape
maketrans = str.maketrans
class URLDefenseDecoder(object):
    """Decode URLs that have been rewritten by Proofpoint URL Defense.

    Three rewriting schemes are handled, selected by the version tag found
    in the rewritten URL (``/v1/``, ``/v2/`` or ``/v3/``).  All patterns are
    class attributes, so they are compiled once and can be used without
    creating an instance (for example ``URLDefenseDecoder.v3_matcher``).
    Decoding keeps no state on the instance.
    """

    # Recognises a URL Defense link and captures its version tag ("v1", ...).
    ud_pattern = re.compile(r'https://urldefense(?:\.proofpoint)?\.com/(v[0-9])/')
    # v1 and v2 carry the wrapped URL in the ``u`` query parameter.
    v1_pattern = re.compile(r'u=(?P<url>.+?)&k=')
    v2_pattern = re.compile(r'u=(?P<url>.+?)&[dc]=')
    # v3 wraps the URL as ``__<url>__;`` followed by a base64 payload that
    # ends at the first ``!``.
    v3_pattern = re.compile(r'v3/__(?P<url>.+?)__;(?P<enc_bytes>.*?)!')

    # Finds complete v3 links inside free text, including the optional
    # bracketed host trailer such as "[github[.]com]".  Example links:
    # https://urldefense.com/v3/__https://github.com/taviso/wpunix__;!!PDiH4ENfjr2_Jw!HYwmxF7GOdHw2YRp2a5P487O_1_JBXGRNdYJ88g43_Nh_1ufYHN9SDNZ4TWZBUGn3Xs_ZlP_8mx7TMZkVzQ9bD8XrPUk1e_IGmM$[github[.]com]
    # https://urldefense.com/v3/__https://groups.google.com/d/msgid/digital-curation/CAO7v-1Sq3zW7yi-opE19p*2BL3vf93F1Nk90cm*2BOOmCH2p*3DebPNg*40mail.gmail.com__;JSUlJQ!!PDiH4ENfjr2_Jw!HYwmxF7GOdHw2YRp2a5P487O_1_JBXGRNdYJ88g43_Nh_1ufYHN9SDNZ4TWZBUGn3Xs_ZlP_8mx7TMZkVzQ9bD8XrPUku5-LAcA$
    # https://urldefense.com/v3/__http://example.com/test_under_score__;!!PDiH4ENfjr2_Jw!Fmw2wel8qlsqPIcK7EBGskW2DigGIcgLmh4pliChiN0CTWSb2yJI2Upn0BmciePHrfEvwEvmeHW1lKQcYN4squMBoIMnFT8$
    # The trailer body is written as single-step alternatives (one
    # non-bracket character or one "[.]") so that an unterminated "[" fails
    # in linear time instead of backtracking exponentially.
    v3_matcher = re.compile(
        r'https://urldefense\.com/v3/__.[^;!$]+__;[^!]*![^!]*![^!]*![^$]*\$'
        r'(?: ?\[(?:[^][]|\[\.\])*\])?'
    )

    # A token is either "*" (one replacement byte) or "**<c>" (a run whose
    # length is looked up from <c> in v3_run_mapping).
    v3_token_pattern = re.compile(r"\*(\*.)?")
    # Matches "scheme:/rest" where only one slash follows the scheme.
    v3_single_slash = re.compile(r"^([a-z0-9+.-]+:/)([^/].+)", re.IGNORECASE)
    # Run-length codes: 'A' -> 2, 'B' -> 3, ... through the 64 characters
    # A-Z, a-z, 0-9, '-' and '_'.
    v3_run_mapping = dict(
        (value, run_length)
        for run_length, value in enumerate(
            string.ascii_uppercase + string.ascii_lowercase + string.digits + '-' + '_',
            2,
        )
    )

    def decode(self, rewritten_url):
        """Return the original URL wrapped inside *rewritten_url*.

        Args:
            rewritten_url: text containing a URL Defense v1, v2 or v3 link.

        Returns:
            The decoded URL as a string.

        Raises:
            ValueError: if no URL Defense link is found, the version tag is
                not v1/v2/v3, or the link cannot be parsed.
        """
        match = self.ud_pattern.search(rewritten_url)
        if not match:
            raise ValueError('Does not appear to be a URL Defense URL')
        version = match.group(1)
        if version == 'v1':
            return self.decode_v1(rewritten_url)
        if version == 'v2':
            return self.decode_v2(rewritten_url)
        if version == 'v3':
            return self.decode_v3(rewritten_url)
        # Build one message string; passing the URL as a second exception
        # argument would make str(error) render as a tuple.
        raise ValueError('Unrecognized version in: {0}'.format(rewritten_url))

    def decode_v1(self, rewritten_url):
        """Decode a v1 link: percent-decode, then HTML-unescape, the ``u`` value.

        Raises:
            ValueError: if the ``u=...&k=`` section is missing.
        """
        match = self.v1_pattern.search(rewritten_url)
        if not match:
            raise ValueError('Error parsing URL')
        return unescape(unquote(match.group('url')))

    def decode_v2(self, rewritten_url):
        """Decode a v2 link.

        In the ``u`` value ``-`` stands for ``%`` and ``_`` stands for ``/``;
        after undoing that, the value is percent-decoded and HTML-unescaped.

        Raises:
            ValueError: if the ``u=...&d=`` / ``u=...&c=`` section is missing.
        """
        match = self.v2_pattern.search(rewritten_url)
        if not match:
            raise ValueError('Error parsing URL')
        # The two substitutions cannot interfere: neither output character
        # is an input of the other.
        url_encoded_url = match.group('url').replace('-', '%').replace('_', '/')
        return unescape(unquote(url_encoded_url))

    def decode_v3(self, rewritten_url):
        """Decode a v3 link.

        The wrapped URL contains ``*`` and ``**<c>`` placeholder tokens; the
        characters they stand for are stored, in order, in a URL-safe base64
        payload after the ``__;`` marker.  Token lengths count *bytes* of
        that payload (in the example
        ``__https://**A11.**C/...__;0LfQtdGO!`` the runs of 2 and 4 consume
        exactly the 6 payload bytes, which are three 2-byte UTF-8
        characters), so the payload is sliced as bytes and each slice is
        decoded as UTF-8.

        Raises:
            ValueError: if the link does not have the v3 layout, the payload
                is not valid base64/UTF-8, a run code is unknown, or the
                payload is too short for the tokens in the URL.
        """
        match = self.v3_pattern.search(rewritten_url)
        if not match:
            raise ValueError('Error parsing URL')

        url = match.group('url')
        # Put back the second slash when only one follows the scheme
        # ("http:/host" -> "http://host").
        single_slash = self.v3_single_slash.match(url)
        if single_slash:
            url = single_slash.group(1) + '/' + single_slash.group(2)
        encoded_url = unquote(url)

        # The payload arrives without '=' padding; surplus padding is
        # ignored by the decoder, so always appending '==' is safe.
        payload = urlsafe_b64decode(match.group('enc_bytes') + '==')
        # One-element list used as a counter the nested function can update
        # (works on interpreters without ``nonlocal``).
        cursor = [0]

        def replace_token(token_match):
            """Return the payload text that one placeholder token stands for."""
            token = token_match.group(0)
            if token == '*':
                length = 1
            else:
                try:
                    length = self.v3_run_mapping[token[-1]]
                except KeyError:
                    raise ValueError(
                        'Error parsing URL: unknown run token {0!r}'.format(token))
            start = cursor[0]
            end = start + length
            if end > len(payload):
                raise ValueError('Error parsing URL: replacement data too short')
            cursor[0] = end
            return payload[start:end].decode('utf-8')

        return self.v3_token_pattern.sub(replace_token, encoded_url)
def main():
    """Command-line entry point.

    With positional arguments, each argument is decoded and the result is
    printed on its own line.  Without arguments the program acts as a
    filter: standard input is read in full, every v3 link matched by
    ``URLDefenseDecoder.v3_matcher`` is replaced by its decoded URL, and the
    text is written to standard output.  Links that cannot be decoded are
    reported on standard error; in filter mode they are left unchanged.
    """
    parser = ArgumentParser(prog='URLDefenseDecode', description='Decode URLs rewritten by URL Defense. Supports v1, v2, and v3 URLs.')
    parser.add_argument('rewritten_urls', nargs='*')
    args = parser.parse_args()
    urldefense_decoder = URLDefenseDecoder()

    if args.rewritten_urls:
        for rewritten_url in args.rewritten_urls:
            try:
                print(urldefense_decoder.decode(rewritten_url))
            except (ValueError, LookupError) as e:
                # LookupError is caught defensively so that a malformed
                # link never aborts the remaining arguments.
                print(e, file=sys.stderr)
        return

    def restore(found):
        """Return the decoded URL for one match, or the match itself on failure."""
        rewritten = found.group(0)
        try:
            return urldefense_decoder.decode(rewritten)
        except (ValueError, LookupError) as e:
            print(e, file=sys.stderr)
            return rewritten

    # Assume stdin/stdout filter.  Substituting match-by-match in a single
    # pass means a link that is a prefix of another one (the same link with
    # and without its "[host]" trailer) cannot corrupt the longer match.
    content = sys.stdin.read()
    sys.stdout.write(urldefense_decoder.v3_matcher.sub(restore, content))
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment