{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regular Expressions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Basic Matching"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<re.Match object; span=(2, 5), match='aza'>\n",
      "<re.Match object; span=(1, 4), match='aza'>\n",
      "None\n",
      "<re.Match object; span=(0, 1), match='x'>\n",
      "<re.Match object; span=(4, 8), match='ping'>\n",
      "<re.Match object; span=(1, 5), match='pong'>\n",
      "<re.Match object; span=(0, 4), match='Pang'>\n",
      "<re.Match object; span=(0, 6), match='Python'>\n",
      "<re.Match object; span=(18, 22), match='hway'>\n",
      "None\n",
      "<re.Match object; span=(0, 6), match='cloudy'>\n",
      "<re.Match object; span=(0, 6), match='cloud9'>\n",
      "<re.Match object; span=(4, 5), match=' '>\n",
      "<re.Match object; span=(30, 31), match='.'>\n",
      "<re.Match object; span=(7, 10), match='cat'>\n",
      "<re.Match object; span=(7, 10), match='dog'>\n",
      "<re.Match object; span=(12, 15), match='dog'>\n",
      "<re.Match object; span=(7, 10), match='cat'>\n",
      "<re.Match object; span=(7, 10), match='dog'>\n",
      "<re.Match object; span=(12, 15), match='dog'>\n",
      "['dog', 'cat']\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "result1 = re.search(r\"aza\", \"plaza\")\n",
    "result2 = re.search(r\"aza\", \"bazaar\")\n",
    "print(result1)\n",
    "print(result2)\n",
    "print(re.search(r\"aza\", \"maze\"))\n",
    "print(re.search(r\"^x\", \"xenon\"))\n",
    "print(re.search(r\"p.ng\", \"clapping\"))\n",
    "print(re.search(r\"p.ng\", \"sponge\"))\n",
    "print(re.search(r\"p.ng\", \"Pangaea\", re.IGNORECASE))\n",
    "print(re.search(r\"[Pp]ython\", \"Python\"))\n",
    "print(re.search(r\"[a-z]way\", \"The end of the highway\"))\n",
    "print(re.search(r\"[a-z]way\", \"What a way to go\"))\n",
    "print(re.search(\"cloud[a-zA-Z0-9]\", \"cloudy\"))\n",
    "print(re.search(\"cloud[a-zA-Z0-9]\", \"cloud9\"))\n",
    "print(re.search(r\"[^a-zA-Z]\", \"This is a sentence with spaces.\"))\n",
    "print(re.search(r\"[^a-zA-Z ]\", \"This is a sentence with spaces.\"))\n",
    "print(re.search(r\"cat|dog\", \"I like cats.\"))\n",
    "print(re.search(r\"cat|dog\", \"I love dogs!\"))\n",
    "print(re.search(r\"cat|dog\", \"I like both dogs and cats.\"))\n",
    "print(re.search(r\"cat|dog\", \"I like cats.\"))\n",
    "print(re.search(r\"cat|dog\", \"I love dogs!\"))\n",
    "print(re.search(r\"cat|dog\", \"I like both dogs and cats.\"))\n",
    "print(re.findall(r\"cat|dog\", \"I like both dogs and cats.\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "False\n",
      "True\n",
      "True\n",
      "False\n",
      "True\n",
      "True\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "import re \n",
    "\n",
    "# This function checks if a given text contains any sequence that has 'a' followed by anything, then 'e' followed by anything, and ends with 'i'. Returns True or False accordingly.\n",
    "def check_aei (text):\n",
    "  result = re.search(r\"a.e.i\", text)\n",
    "  return result != None\n",
    "\n",
    "print(check_aei(\"academia\")) # This should return: True\n",
    "print(check_aei(\"aerial\")) # This should return: False\n",
    "print(check_aei(\"paramedic\")) # This should return: True\n",
    "\n",
    "# This function checks if a given text contains any punctuation marks (comma, period, colon, semicolon, question mark or exclamation point). Returns True or False accordingly.\n",
    "def check_punctuation (text):\n",
    "  result = re.search(r\"[,.:;?!]\", text)\n",
    "  return result != None\n",
    "\n",
    "print(check_punctuation(\"This is a sentence that ends with a period.\")) # This should return: True\n",
    "print(check_punctuation(\"This is a sentence fragment without a period\")) # This should return: False\n",
    "print(check_punctuation(\"Aren't regular expressions awesome?\")) # This should return: True\n",
    "print(check_punctuation(\"Wow! We're really picking up some steam now!\")) # This should return: True\n",
    "print(check_punctuation(\"End of the line\")) # This should return: False\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<re.Match object; span=(0, 9), match='Pygmalion'>\n",
      "<re.Match object; span=(0, 17), match='Python Programmin'>\n",
      "<re.Match object; span=(0, 6), match='Python'>\n",
      "<re.Match object; span=(0, 3), match='Pyn'>\n",
      "<re.Match object; span=(1, 3), match='ol'>\n",
      "<re.Match object; span=(1, 5), match='ooll'>\n",
      "None\n",
      "<re.Match object; span=(3, 7), match='each'>\n",
      "<re.Match object; span=(7, 12), match='peach'>\n"
     ]
    }
   ],
   "source": [
    "import re \n",
    "print(re.search(r\"Py.*n\", \"Pygmalion\"))\n",
    "# re.search() function returns a match object if it finds the pattern 'Py followed by any characters ending in n' within the\n",
    "# string \"Pygmalion\". The '.' means any character (except newline), and '*' means zero or more repetitions of the preceding\n",
    "# RE. Here, the RE is '.' which can mean anything. So this search looks for occurrences of 'Py' followed by zero\n",
    "# or more characters, then 'n'. In \"Pygmalion\", it finds and returns a match object for \"Pygmalion\". \n",
    "print(re.search(r\"Py.*n\", \"Python Programming\"))\n",
    "# Here we search the string \"Python Programming\" for the same pattern: 'Py' followed by any characters and then 'n'.\n",
    "# Because '*' is greedy, '.*' takes as many characters as it can, so the match runs to the last 'n': \"Python Programmin\".\n",
    "print(re.search(r\"Py[a-z]*n\", \"Python Programming\"))\n",
    "# Here we are using a character set '[a-z]' which means any lowercase letter. So it allows 'P' followed by zero or more\n",
    "# lowercase letters ending with 'n'. In this case, the match object is for \"Python\" from the word Python Programming as all\n",
    "# characters in between P and n are lowercase.\n",
    "print(re.search(r\"Py[a-z]*n\", \"Pyn\"))\n",
    "# Here we search a pattern where 'P' followed by zero or more lowercase letters ending with 'n' in the string 'Pyn' itself. \n",
    "# It returns match object for whole string Pyn as it satisfies our RE conditions.\n",
    "print(re.search(r\"o+l+\", \"goldfish\"))\n",
    "# Here we are looking for one or more 'o' followed by one or more 'l'. In the word goldfish, a single 'o' is followed by a\n",
    "# single 'l', so it returns a match object for \"ol\".\n",
    "print(re.search(r\"o+l+\", \"woolly\"))\n",
    "# Here we are looking for one or more 'o' followed by one or more 'l'. In the word woolly there is 'oo' followed by 'll', so it\n",
    "# returns a match object for \"ooll\" (not for the whole word).\n",
    "print(re.search(r\"o+l+\", \"boil\"))\n",
    "# Here we are looking for one or more 'o' followed by one or more 'l'. In the word boil, the 'o' and the 'l' are separated\n",
    "# by an 'i', so nothing matches and it returns None.\n",
    "print(re.search(r\"p?each\", \"To each their own\"))\n",
    "# Here we are looking for an optional 'p' followed by 'each' as 'p' can occur zero or one time. In the string \n",
    "# \"To each their own\", it still returns a match object for \"each\" even though there is no 'p' before it. \n",
    "print(re.search(r\"p?each\", \"I like peaches\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The repeating_letter_a function checks if the text passed includes the letter \"a\" (lowercase or uppercase) at least twice.\n",
    "# For example, repeating_letter_a(\"banana\") is True, while repeating_letter_a(\"pineapple\") is False. Fill in the code to\n",
    "# make this work. \n",
    "import re\n",
    "def repeating_letter_a(text):\n",
    "  result = re.search(r\"[Aa].*[Aa]\", text)\n",
    "  return result != None\n",
    "\n",
    "print(repeating_letter_a(\"banana\")) # True\n",
    "print(repeating_letter_a(\"pineapple\")) # False\n",
    "print(repeating_letter_a(\"Animal Kingdom\")) # True\n",
    "print(repeating_letter_a(\"A is for apple\")) # True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "# This line of code uses regular expressions (regex) to search the word \"welcome\" for '.com'. The unescaped '.' matches any single character, so \"lcom\" is a match. \n",
    "print(re.search(r\".com\", \"welcome\")) # <re.Match object; span=(2, 6), match='lcom'>\n",
    "# This line of code uses regular expressions (regex) to search for any occurrence of '\\.com' in the word \"welcome\". \n",
    "# The output will be None because there is no '.com' within the string 'welcome'. \n",
    "# The backslash before '.' in the regex pattern escapes the period, making it match a literal period character rather than any character (as the period itself does in regex).\n",
    "print(re.search(r\"\\.com\", \"welcome\")) # None\n",
    "# This line of code uses regular expressions (regex) to search for any occurrence of '.com' in the word \"mydomain.com\". \n",
    "print(re.search(r\"\\.com\", \"mydomain.com\")) # <re.Match object; span=(8, 12), match='.com'>\n",
    "# In this \\w matches any alphanumeric character and '*' denotes zero or more repetitions in the word \"This is an example\". \n",
    "print(re.search(r\"\\w*\", \"This is an example\")) # <re.Match object; span=(0, 4), match='This'>\n",
    "# This line of code uses regular expressions (regex) to search for any occurrence of '\\w*' in the word \"And_this_is_another\". \n",
    "print(re.search(r\"\\w*\", \"And_this_is_another\")) # <re.Match object; span=(0, 19), match='And_this_is_another'>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill in the code to check if the text passed has at least 2 groups of alphanumeric characters\n",
    "# (including letters, numbers, and underscores) separated by one or more whitespace characters.\n",
    "import re\n",
    "def check_character_groups(text):\n",
    "  result = re.search(r\"\\w\\s\\w\", text)\n",
    "  return result != None\n",
    "\n",
    "print(check_character_groups(\"One\")) # False\n",
    "print(check_character_groups(\"123  Ready Set GO\")) # True\n",
    "print(check_character_groups(\"username user_01\")) # True\n",
    "print(check_character_groups(\"shopping_list: milk, bread, eggs.\")) # False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re  # import Python's regular expressions module\n",
    "# This searches for a pattern in 'Argentina' that starts with an \"A\" and ends with an \"a\". The .* is a wildcard that can match any character (except newline) between A and a. It returns the matching object span.\n",
    "print(re.search(r\"A.*a\", \"Argentina\"))  # <_sre.SRE_Match object; span=(0, 9), match='Argentina'>\n",
    "# Same as previous search but in 'Azerbaijan'. It also returns the matching object span.\n",
    "print(re.search(r\"A.*a\", \"Azerbaijan\"))  # <_sre.SRE_Match object; span=(0, 9), match='Azerbaija'>\n",
    "# This checks if the entire string 'Australia' starts with an \"A\" and ends with an \"a\". The ^ signifies start of a line and $ signifies end of a line. It returns a match object for the whole string because 'Australia' begins with \"A\" and ends with \"a\".\n",
    "print(re.search(r\"^A.*a$\", \"Australia\"))  # <re.Match object; span=(0, 9), match='Australia'>\n",
    "# This is a pattern that matches a string if it starts with a letter or an underscore and is then followed by any number of letters, digits, or underscores. \n",
    "pattern = r\"^[a-zA-Z_][a-zA-Z0-9_]*$\"  # valid variable pattern in Python according to the standard conventions\n",
    "# This searches if \"_this_is_a_valid_variable_name\" matches the pattern. It returns the matching object span because it does match the pattern.\n",
    "print(re.search(pattern, \"_this_is_a_valid_variable_name\"))  # <_sre.SRE_Match object; span=(0, 30), match='_this_is_a_valid_variable_name'>\n",
    "# This searches if \"this isn't a valid variable\" matches the pattern. It returns None because it contains space which is not allowed in Python variables according to standard conventions.\n",
    "print(re.search(pattern, \"this isn't a valid variable\"))  # None\n",
    "# This searches if \"my_variable1\" matches the pattern. It returns matching object span as it does match the pattern.\n",
    "print(re.search(pattern, \"my_variable1\")) # <_sre.SRE_Match object; span=(0, 12), match='my_variable1'>\n",
    "# This searches if \"2my_variable1\" matches the pattern. It returns None because it starts with a digit which is not allowed in Python variables according to standard conventions.\n",
    "print(re.search(pattern, \"2my_variable1\")) # None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill in the code to check if the text passed looks like a standard sentence, meaning that it starts with an\n",
    "# uppercase letter, followed by at least some lowercase letters or a space, and ends with a period, question\n",
    "# mark, or exclamation point. \n",
    "import re\n",
    "def check_sentence(text):\n",
    "  result = re.search(r\"^[A-Z][a-z\\s].*[\\.?!]$\", text)\n",
    "  return result != None\n",
    "\n",
    "print(check_sentence(\"Is this is a sentence?\")) # True\n",
    "print(check_sentence(\"is this is a sentence?\")) # False\n",
    "print(check_sentence(\"Hello\")) # False\n",
    "print(check_sentence(\"1-2-3-GO!\")) # False\n",
    "print(check_sentence(\"A star is born.\")) # True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- `r\"\\d{3}-\\d{3}-\\d{4}\"` This pattern matches U.S. phone numbers in the format 111-222-3333.\n",
    "- `r\"^-?\\d*(\\.\\d+)?$\"` This pattern matches any positive or negative number, with or without decimal places.\n",
    "- `r\"^(.+)\\/([^\\/]+)\\/\"` This pattern matches any path and filename."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#The check_web_address() function checks if the text passed qualifies as a top-level web address,\n",
    "# meaning that it contains alphanumeric characters (which includes letters, numbers, and underscores),\n",
    "# as well as periods, dashes, and a plus sign, followed by a period and a character-only top-level\n",
    "# domain such as \".com\", \".info\", \".edu\", etc. Fill in the regular expression to do that, using escape\n",
    "# characters, wildcards, repetition qualifiers, beginning and end-of-line characters, and character classes.\n",
    "import re\n",
    "def check_web_address(text):\n",
    "  pattern = r\"^[A-Za-z0-9_.-]*\\.[A-Za-z]+$\"\n",
    "  result = re.search(pattern, text)\n",
    "  return result != None\n",
    "\n",
    "print(check_web_address(\"gmail.com\")) # True\n",
    "print(check_web_address(\"www@google\")) # False\n",
    "print(check_web_address(\"www.Coursera.org\")) # True\n",
    "print(check_web_address(\"web-address.com/homepage\")) # False\n",
    "print(check_web_address(\"My_Favorite-Blog.US\")) # True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def check_time(text):\n",
    "  pattern = r\"[0-9][:][0-5][0-9]\\s?(am|AM|pm|PM)\"\n",
    "  result = re.search(pattern, text)\n",
    "  return result != None\n",
    "\n",
    "print(check_time(\"12:45pm\")) # True\n",
    "print(check_time(\"9:59 AM\")) # True\n",
    "print(check_time(\"6:60am\")) # False\n",
    "print(check_time(\"five o'clock\")) # False\n",
    "print(check_time(\"6:02 am\")) # True\n",
    "print(check_time(\"6:02km\")) # False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def contains_acronym(text):\n",
    "  pattern = r\".*\\([A-Za-z0-9]+\\).*\"\n",
    "  result = re.search(pattern, text)\n",
    "  return result != None\n",
    "\n",
    "print(contains_acronym(\"Instant messaging (IM) is a set of communication technologies used for text-based communication\")) # True\n",
    "print(contains_acronym(\"American Standard Code for Information Interchange (ASCII) is a character encoding standard for electronic communication\")) # True\n",
    "print(contains_acronym(\"Please do NOT enter without permission!\")) # False\n",
    "print(contains_acronym(\"PostScript is a fourth-generation programming language (4GL)\")) # True\n",
    "print(contains_acronym(\"Have fun using a self-contained underwater breathing apparatus (Scuba)!\")) # True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def check_zip_code (text):\n",
    " result = re.search(r\"^.*\\s(\\d{5})(-\\d{4})?.*$\", text)\n",
    " return result != None\n",
    "\n",
    "print(check_zip_code(\"The zip codes for New York are 10001 thru 11104.\")) # True\n",
    "print(check_zip_code(\"90210 is a TV show\")) # False\n",
    "print(check_zip_code(\"Their address is: 123 Main Street, Anytown, AZ 85258-0001.\")) # True\n",
    "print(check_zip_code(\"The Parliament of Canada is at 111 Wellington St, Ottawa, ON K1A0A9.\")) # False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Advanced Matching"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "# The regex is searching for a string that starts with a word (group 1), followed by a comma and a space,\n",
    "# and then ends with another word (group 2)\n",
    "result = re.search(r\"^(\\w*), (\\w*)$\", \"Lovelace, Ada\")\n",
    "print(result)\n",
    "print(result.groups())\n",
    "print(result[0])\n",
    "print(result[1])\n",
    "print(result[2])\n",
    "\"{} {}\".format(result[2], result[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Lovelace Ada\n",
      "Ada Lovelace\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "def rearrange_name(name):\n",
    "    result = re.search(r\"^(\\w*), (\\w*)$\", name)\n",
    "    if result is None:\n",
    "        return name\n",
    "    return \"{} {}\".format(result[1], result[2])\n",
    "def rearrange_surname(name):\n",
    "    result = re.search(r\"^(\\w*), (\\w*)$\", name)\n",
    "    if result is None:\n",
    "        return name\n",
    "    return \"{} {}\".format(result[2], result[1])\n",
    "print(rearrange_name(\"Lovelace, Ada\"))\n",
    "print(rearrange_surname(\"Lovelace, Ada\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fix the regular expression used in the rearrange_name function so that it can match middle names, middle initials, as well as double surnames.\n",
    "import re\n",
    "def rearrange_name(name):\n",
    "#   result = re.search(r\"^(\\w*), (\\w*)$\", name) # from this to:\n",
    "  result = re.search(r\"^(\\w.*\\w*), (\\w.*\\w*)$\", name)\n",
    "  if result == None:\n",
    "    return name\n",
    "  return \"{} {}\".format(result[2], result[1])\n",
    "\n",
    "name=rearrange_name(\"Kennedy, John F.\")\n",
    "print(name)\n",
    "\n",
    "# Output:\n",
    "# John F. Kennedy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Repetition Qualifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "print(re.search(r\"[a-zA-Z]{5}\", \"a ghost\")) # This line searches for any alphabetic character (a-z or A-Z) sequence of length 5 in the string \"a ghost\". It finds 'ghost', the first run of five letters, and returns it as a Match object.\n",
    "print(re.search(r\"[a-zA-Z]{5}\", \"a scary ghost appeared\")) # This line searches for any alphabetic character (a-z or A-Z) sequence of length 5 in the string \"a scary ghost appeared\". It will find a match for 'scary' and return it as a Match object.\n",
    "print(re.findall(r\"[a-zA-Z]{5}\", \"a scary ghost appeared\")) # This line finds all (non-overlapping) occurrences of any alphabetic character (a-z or A-Z) sequence of length 5 in the string \"a scary ghost appeared\". It will find 'scary', 'ghost' and also 'appea' (the first five letters of 'appeared'), returning them as a list of strings ['scary', 'ghost', 'appea'].\n",
    "re.findall(r\"\\b[a-zA-Z]{5}\\b\", \"A scary ghost appeared\") # This line finds all (non-overlapping) occurrences of any alphabetic character (a-z or A-Z) sequence of length 5 that are separate words in the string \"A scary ghost appeared\". The \\b word boundaries require the five letters to form a whole word, so the result is ['scary', 'ghost'] and 'appeared' is skipped. Nothing is displayed for this line because the call is not wrapped in print() and is not the last expression in the cell.\n",
    "print(re.findall(r\"\\w{5,10}\", \"I really like strawberries\")) # This line finds all (non-overlapping) occurrences of a run of alphanumeric characters that is between 5 and 10 characters long in the string \"I really like strawberries\". It will find 'really' and 'strawberri' (the match is cut off after 10 characters), returning them as a list of strings ['really', 'strawberri'].\n",
    "print(re.findall(r\"\\w{5,}\", \"I really like strawberries\")) # This line finds all (non-overlapping) occurrences of a word composed of alphanumeric characters that is at least 5 characters long in the string \"I really like strawberries\". It will find matches for 'really' and 'strawberries', returning them as a list of strings ['really', 'strawberries'].\n",
    "print(re.search(r\"s\\w{,20}\", \"I really like strawberries\")) # This line searches for the letter s followed by up to 20 alphanumeric characters in the string \"I really like strawberries\". It will find a match for 'strawberries' because it starts with 's' and is followed by no more than 20 alphanumeric characters, returning it as a Match object.\n",
    "\n",
    "# Output:\n",
    "# <re.Match object; span=(2, 7), match='ghost'>\n",
    "# <re.Match object; span=(2, 7), match='scary'>\n",
    "# ['scary', 'ghost', 'appea']\n",
    "# ['really', 'strawberri']\n",
    "# ['really', 'strawberries']\n",
    "# <re.Match object; span=(14, 26), match='strawberries'>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extracting a PID using regexes in Python\n",
    "import re\n",
    "log = \"July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade\"\n",
    "regex = r\"\\[(\\d+)\\]\"\n",
    "result = re.search(regex, log)\n",
    "# This will match any string with a number enclosed in square brackets like [12345] or [67890] etc. \n",
    "# But if the string does not have such a pattern, it returns None. \n",
    "# We'll use this function later to extract PID from our log strings.\n",
    "result = re.search(regex, \"A completely different string that also has numbers [34567]\")\n",
    "result = re.search(regex, \"99 elephants in a [cage]\")\n",
    "def extract_pid(log_line):\n",
    "    # This function takes as input a log line (string) and returns the number within square brackets if such a pattern exists in the string\n",
    "    regex = r\"\\[(\\d+)\\]\"\n",
    "    result = re.search(regex, log_line)\n",
    "    if result is None: \n",
    "        return \"No PID found\" # If no match found, return this message.\n",
    "    return result[1] # The result[1] will be the number within square brackets in string format.  \n",
    "print(extract_pid(log))\n",
    "# It prints '12345' as there is a pattern [12345] in the given log line\n",
    "print(extract_pid(\"99 elephants in a [cage]\")) \n",
    "# It prints 'No PID found' as there are no numbers within square brackets in this string.\n",
    "# Output:\n",
    "# 12345\n",
    "# No PID found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def extract_pid(log_line):\n",
    "    regex = r\"\\[(\\d+)\\]: (\\w+)\"   # Modify regex to return uppercase message in parenthesis after process ID\n",
    "    result = re.search(regex, log_line)\n",
    "    if result is None:\n",
    "        return None\n",
    "    pid = result.groups()[0]  # Fetch the first group (process id)\n",
    "    message = result.groups()[1] # fetch the second group (uppercase message)\n",
    "    return \"{} ({})\".format(pid, message) \n",
    "\n",
    "print(extract_pid(\"July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade\")) # 12345 (ERROR)\n",
    "print(extract_pid(\"99 elephants in a [cage]\")) # None\n",
    "print(extract_pid(\"A string that also has numbers [34567] but no uppercase message\")) # None\n",
    "print(extract_pid(\"July 31 08:08:08 mycomputer new_process[67890]: RUNNING Performing backup\")) # 67890 (RUNNING)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Splitting and replacing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "print(re.split(r\"[.?!]\", \"One sentence. Another one? And the last one!\"))\n",
    "print(re.split(r\"([.?!])\", \"One sentence. Another one? And the last one!\"))\n",
    "print(re.sub(r\"[\\w.%+-]+@[\\w.-]+\", \"[REDACTED]\", \"Received an email for go_nuts95@my.example.com\"))\n",
    "print(re.sub(r\"^([\\w .-]*), ([\\w .-]*)$\", r\"\\2 \\1\", \"Lovelace, Ada\"))\n",
    "\n",
    "# Output:\n",
    "# ['One sentence', ' Another one', ' And the last one', '']\n",
    "# ['One sentence', '.', ' Another one', '?', ' And the last one', '!', '']\n",
    "# Received an email for [REDACTED]\n",
    "# Ada Lovelace"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sabrina Green,+1-802-867-5309,System Administrator\n",
      "Eli Jones,+1-684-3481127,IT specialist\n",
      "Melody Daniels,+1-846-687-7436,Programmer\n",
      "Charlie Rivera,+1-698-746-3357,Web Developer\n"
     ]
    }
   ],
   "source": [
    "# Question 1    X -> V\n",
    "# You’re working with a CSV file that contains employee information. Each record has a name field, followed by a phone number field, and a role field. The phone number field contains U.S. phone numbers and needs to be modified to the international format, with +1- in front of the phone number. The rest of the phone number should not change. Fill in the regular expression, using groups, to use the transform_record() function to do that.\n",
    "\n",
    "import re\n",
    "def transform_record(record):\n",
    "  new_record = re.sub(r'(\\w+ \\w+),(\\d{3}-\\d{7}|\\d{3}-\\d{3}-\\d{4}),(\\w+\\s*\\w+)', r'\\1,+1-\\2,\\3', record)\n",
    "  return new_record\n",
    "\n",
    "print(transform_record(\"Sabrina Green,802-867-5309,System Administrator\")) # Sabrina Green,+1-802-867-5309,System Administrator\n",
    "print(transform_record(\"Eli Jones,684-3481127,IT specialist\"))  # Eli Jones,+1-684-3481127,IT specialist\n",
    "print(transform_record(\"Melody Daniels,846-687-7436,Programmer\")) # Melody Daniels,+1-846-687-7436,Programmer\n",
    "print(transform_record(\"Charlie Rivera,698-746-3357,Web Developer\")) # Charlie Rivera,+1-698-746-3357,Web Developer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Question 2    V\n",
    "# The multi_vowel_words() function returns all words with 3 or more consecutive vowels (a, e, i, o, u). Fill in the regular expression to do that.\n",
    "import re\n",
    "def multi_vowel_words(text):\n",
    "  pattern = r'\\b\\w*[aeiou]{3}\\w*\\b'\n",
    "  result = re.findall(pattern, text)\n",
    "  return result\n",
    "\n",
    "print(multi_vowel_words(\"Life is beautiful\")) # ['beautiful']\n",
    "print(multi_vowel_words(\"Obviously, the queen is courageous and gracious.\")) # ['Obviously', 'queen', 'courageous', 'gracious']\n",
    "print(multi_vowel_words(\"The rambunctious children had to sit quietly and await their delicious dinner.\")) # ['rambunctious', 'quietly', 'delicious']\n",
    "print(multi_vowel_words(\"The order of a data queue is First In First Out (FIFO)\")) # ['queue']\n",
    "print(multi_vowel_words(\"Hello world!\")) # []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "// Start of program\n",
      "  number = 0   // Initialize the variable\n",
      "  number += 1   // Increment the variable\n",
      "  return(number)\n"
     ]
    }
   ],
   "source": [
    "# Question 4    X -> V\n",
    "# The transform_comments() function converts comments in a Python script into those usable by a C compiler. This means looking for text that begins with a hash mark (#) and replacing it with double slashes (//), which is the C single-line comment indicator. For the purpose of this exercise, we'll ignore the possibility of a hash mark embedded inside of a Python command, and assume that it's only used to indicate a comment. We also want to treat repetitive hash marks (##), (###), etc., as a single comment indicator, to be replaced with just (//) and not (#//) or (//#). Fill in the parameters of the substitution method to complete this function: \n",
    "import re\n",
    "def transform_comments(line_of_code):\n",
    "  result = re.sub('#+', '//', line_of_code)\n",
    "  return result\n",
    "\n",
    "print(transform_comments(\"### Start of program\")) # Should be \"// Start of program\"\n",
    "print(transform_comments(\"  number = 0   ## Initialize the variable\")) # Should be \"  number = 0   // Initialize the variable\"\n",
    "print(transform_comments(\"  number += 1   # Increment the variable\")) # Should be \"  number += 1   // Increment the variable\"\n",
    "print(transform_comments(\"  return(number)\")) # Should be \"  return(number)\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "My number is (212) 345-9999.\n",
      "Please call (888) 555-1234\n",
      "123-123-12345\n",
      "Phone number of Buckingham Palace is +44 303 123 7300\n"
     ]
    }
   ],
   "source": [
    "# Question 5\n",
    "# The convert_phone_number() function checks for a U.S. phone number format: XXX-XXX-XXXX (3 digits followed by a dash, 3 more digits followed by a dash, and 4 digits), and converts it to a more formal format that looks like this: (XXX) XXX-XXXX. Fill in the regular expression to complete this function.\n",
    "import re\n",
    "def convert_phone_number(phone):\n",
    "  \"\"\"Rewrite every XXX-XXX-XXXX number found in phone as (XXX) XXX-XXXX.\"\"\"\n",
    "  # The \\b anchors keep longer digit runs such as 123-123-12345 from matching.\n",
    "  us_number = re.compile(r\"\\b(\\d{3})-(\\d{3})-(\\d{4})\\b\")\n",
    "  return us_number.sub(r\"(\\1) \\2-\\3\", phone)\n",
    "\n",
    "# Demo calls; the expected output for each sample is noted beside it.\n",
    "for sample_text in (\n",
    "    \"My number is 212-345-9999.\",  # My number is (212) 345-9999.\n",
    "    \"Please call 888-555-1234\",  # Please call (888) 555-1234\n",
    "    \"123-123-12345\",  # 123-123-12345\n",
    "    \"Phone number of Buckingham Palace is +44 303 123 7300\",  # Phone number of Buckingham Palace is +44 303 123 7300\n",
    "):\n",
    "  print(convert_phone_number(sample_text))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Glossary\n",
    "### Terms and definitions from Course 2, Module 3\n",
    "**Alternation:** RegEx that matches any one of the alternatives separated by the pipe symbol.\n",
    "\n",
    "**Backreference:** This is applied when using re.sub() to substitute the value of a capture group into the output.\n",
    "\n",
    "**Character classes:** These are written inside square brackets and let us list the characters we want to match inside of those brackets.\n",
    "\n",
    "**Character ranges:** Ranges used to match a single character against a set of possibilities.\n",
    "\n",
    "**grep:** An especially easy to use yet extremely powerful tool for applying RegExes.\n",
    "\n",
    "**Lookahead:** RegEx that matches a pattern only if it’s followed by another pattern.\n",
    "\n",
    "**Regular expression:** A search query for text that's expressed by a string pattern, also known as RegEx or RegExp.\n",
    "\n",
    "**Wildcard:** A character that can match more than one character.\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!/usr/bin/env python3\n",
    "import re\n",
    "import csv\n",
    "\n",
    "def contains_domain(address, domain):\n",
    "  \"\"\"Returns True if the email address contains the given domain in the domain position, False if not.\n",
    "\n",
    "  The domain is matched literally: it is passed through re.escape() so that\n",
    "  regex metacharacters in it (notably the '.' in 'abc.edu') cannot match\n",
    "  arbitrary characters, e.g. 'user@abcxedu' is not treated as 'abc.edu'.\n",
    "  \"\"\"\n",
    "  domain_pattern = r'[\\w\\.-]+@' + re.escape(domain) + '$'\n",
    "  return re.match(domain_pattern, address) is not None\n",
    "\n",
    "def replace_domain(address, old_domain, new_domain):\n",
    "  \"\"\"Replaces the old domain with the new domain in the received address.\n",
    "\n",
    "  Only an old_domain at the end of address (as matched by '$') is replaced;\n",
    "  an address that does not end with it is returned unchanged. Both domains\n",
    "  are treated as literal text rather than as regex syntax.\n",
    "  \"\"\"\n",
    "  # re.escape() stops the '.' in a domain from matching any character.\n",
    "  old_domain_pattern = re.escape(old_domain) + '$'\n",
    "  # A callable replacement is inserted verbatim, so backslashes or group\n",
    "  # references in new_domain are not interpreted by re.sub().\n",
    "  return re.sub(old_domain_pattern, lambda _match: new_domain, address)\n",
    "\n",
    "def main():\n",
    "  \"\"\"Processes the list of emails, replacing any instances of the old domain with the new domain.\n",
    "\n",
    "  Reads every row of the CSV at csv_file_location, rewrites the cells in the\n",
    "  ' Email Address' column whose address is on old_domain, and writes all rows\n",
    "  (header included) to report_file.\n",
    "  \"\"\"\n",
    "  old_domain, new_domain = 'abc.edu', 'xyz.edu'\n",
    "  csv_file_location = '<csv_file_location>'\n",
    "  report_file = '<data-directory>' + '/updated_user_emails.csv'\n",
    "\n",
    "  # The csv module documentation asks for newline='' on files handed to\n",
    "  # csv.reader / csv.writer so the csv module controls line endings itself.\n",
    "  with open(csv_file_location, 'r', newline='') as f:\n",
    "    user_data_list = list(csv.reader(f))\n",
    "\n",
    "  # The header label is looked up with its leading space included.\n",
    "  email_key = ' ' + 'Email Address'\n",
    "  email_index = user_data_list[0].index(email_key)\n",
    "\n",
    "  # Map each old-domain cell to its replacement. Keys and values carry a single\n",
    "  # leading space, so a cell is rewritten only when it equals ' ' + address.\n",
    "  updated_cells = {}\n",
    "  for user in user_data_list[1:]:\n",
    "    email_address = user[email_index].strip()\n",
    "    if contains_domain(email_address, old_domain):\n",
    "      updated_email = replace_domain(email_address, old_domain, new_domain)\n",
    "      updated_cells[' ' + email_address] = ' ' + updated_email\n",
    "\n",
    "  for user in user_data_list[1:]:\n",
    "    user[email_index] = updated_cells.get(user[email_index], user[email_index])\n",
    "\n",
    "  # The with-blocks close both files; no explicit close() calls are needed.\n",
    "  with open(report_file, 'w', newline='') as output_file:\n",
    "    csv.writer(output_file).writerows(user_data_list)\n",
    "\n",
    "main()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}