Using_Python_to_Interact_wi.../module3.ipynb
2024-12-30 19:30:31 +03:00

789 lines
35 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Regular Expressions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic Matching"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<re.Match object; span=(2, 5), match='aza'>\n",
"<re.Match object; span=(1, 4), match='aza'>\n",
"None\n",
"<re.Match object; span=(0, 1), match='x'>\n",
"<re.Match object; span=(4, 8), match='ping'>\n",
"<re.Match object; span=(1, 5), match='pong'>\n",
"<re.Match object; span=(0, 4), match='Pang'>\n",
"<re.Match object; span=(0, 6), match='Python'>\n",
"<re.Match object; span=(18, 22), match='hway'>\n",
"None\n",
"<re.Match object; span=(0, 6), match='cloudy'>\n",
"<re.Match object; span=(0, 6), match='cloud9'>\n",
"<re.Match object; span=(4, 5), match=' '>\n",
"<re.Match object; span=(30, 31), match='.'>\n",
"<re.Match object; span=(7, 10), match='cat'>\n",
"<re.Match object; span=(7, 10), match='dog'>\n",
"<re.Match object; span=(12, 15), match='dog'>\n",
"<re.Match object; span=(7, 10), match='cat'>\n",
"<re.Match object; span=(7, 10), match='dog'>\n",
"<re.Match object; span=(12, 15), match='dog'>\n",
"['dog', 'cat']\n"
]
}
],
"source": [
"import re\n",
"result1 = re.search(r\"aza\", \"plaza\")\n",
"result2 = re.search(r\"aza\", \"bazaar\")\n",
"print(result1)\n",
"print(result2)\n",
"print(re.search(r\"aza\", \"maze\"))\n",
"print(re.search(r\"^x\", \"xenon\"))\n",
"print(re.search(r\"p.ng\", \"clapping\"))\n",
"print(re.search(r\"p.ng\", \"sponge\"))\n",
"print(re.search(r\"p.ng\", \"Pangaea\", re.IGNORECASE))\n",
"print(re.search(r\"[Pp]ython\", \"Python\"))\n",
"print(re.search(r\"[a-z]way\", \"The end of the highway\"))\n",
"print(re.search(r\"[a-z]way\", \"What a way to go\"))\n",
"print(re.search(\"cloud[a-zA-Z0-9]\", \"cloudy\"))\n",
"print(re.search(\"cloud[a-zA-Z0-9]\", \"cloud9\"))\n",
"print(re.search(r\"[^a-zA-Z]\", \"This is a sentence with spaces.\"))\n",
"print(re.search(r\"[^a-zA-Z ]\", \"This is a sentence with spaces.\"))\n",
"print(re.search(r\"cat|dog\", \"I like cats.\"))\n",
"print(re.search(r\"cat|dog\", \"I love dogs!\"))\n",
"print(re.search(r\"cat|dog\", \"I like both dogs and cats.\"))\n",
"print(re.search(r\"cat|dog\", \"I like cats.\"))\n",
"print(re.search(r\"cat|dog\", \"I love dogs!\"))\n",
"print(re.search(r\"cat|dog\", \"I like both dogs and cats.\"))\n",
"print(re.findall(r\"cat|dog\", \"I like both dogs and cats.\"))"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"False\n",
"True\n",
"True\n",
"False\n",
"True\n",
"True\n",
"False\n"
]
}
],
"source": [
"import re \n",
"\n",
"# This function checks if a given text contains any sequence that has 'a' followed by anything, then 'e' followed by anything, and ends with 'i'. Returns True or False accordingly.\n",
"def check_aei (text):\n",
" result = re.search(r\"a.e.i\", text)\n",
" return result != None\n",
"\n",
"print(check_aei(\"academia\")) # This should return: True\n",
"print(check_aei(\"aerial\")) # This should return: False\n",
"print(check_aei(\"paramedic\")) # This should return: True\n",
"\n",
"# This function checks if a given text contains any punctuation marks (comma, period, colon, semicolon, question mark or exclamation point). Returns True or False accordingly.\n",
"def check_punctuation (text):\n",
" result = re.search(r\"[,.:;?!]\", text)\n",
" return result != None\n",
"\n",
"print(check_punctuation(\"This is a sentence that ends with a period.\")) # This should return: True\n",
"print(check_punctuation(\"This is a sentence fragment without a period\")) # This should return: False\n",
"print(check_punctuation(\"Aren't regular expressions awesome?\")) # This should return: True\n",
"print(check_punctuation(\"Wow! We're really picking up some steam now!\")) # This should return: True\n",
"print(check_punctuation(\"End of the line\")) # This should return: False\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<re.Match object; span=(0, 9), match='Pygmalion'>\n",
"<re.Match object; span=(0, 17), match='Python Programmin'>\n",
"<re.Match object; span=(0, 6), match='Python'>\n",
"<re.Match object; span=(0, 3), match='Pyn'>\n",
"<re.Match object; span=(1, 3), match='ol'>\n",
"<re.Match object; span=(1, 5), match='ooll'>\n",
"None\n",
"<re.Match object; span=(3, 7), match='each'>\n",
"<re.Match object; span=(7, 12), match='peach'>\n"
]
}
],
"source": [
"import re \n",
"print(re.search(r\"Py.*n\", \"Pygmalion\"))\n",
"# re.search() function returns a match object if it finds the pattern 'Py followed by any characters ending in n' within the\n",
"# string \"Pygmalion\". The '.' means any character (except newline), and '*' means zero or more repetitions of the preceding\n",
"# RE. Here, the RE is '.' which can mean anything. So this search looks for occurrences of the letter 'P' followed by one\n",
"# or more characters, then 'n'. In \"Pygmalion\", it finds and returns a match object for \"Pygmalion\". \n",
"print(re.search(r\"Py.*n\", \"Python Programming\"))\n",
"# Here we are searching for the pattern in the string Python Programming which also includes 'P' at start followed by any\n",
"# characters ending with 'n'. So it returns a match object for \"Python\" from the word \"Python Programming\".\n",
"print(re.search(r\"Py[a-z]*n\", \"Python Programming\"))\n",
"# Here we are using a character set '[a-z]' which means any lowercase letter. So it allows 'P' followed by zero or more\n",
"# lowercase letters ending with 'n'. In this case, the match object is for \"Python\" from the word Python Programming as all\n",
"# characters in between P and n are lowercase.\n",
"print(re.search(r\"Py[a-z]*n\", \"Pyn\"))\n",
"# Here we search a pattern where 'P' followed by zero or more lowercase letters ending with 'n' in the string 'Pyn' itself. \n",
"# It returns match object for whole string Pyn as it satisfies our RE conditions.\n",
"print(re.search(r\"o+l+\", \"goldfish\"))\n",
"# Here we are looking for one or more 'o' followed by one or more 'l'. In the word goldfish, there is no 'o' followed by 'l'\n",
"# so it returns None.\n",
"print(re.search(r\"o+l+\", \"woolly\"))\n",
"# Here we are looking for one or more 'o' followed by one or more 'l'. In the word wooly there is 'oo' and both l's, so it\n",
"# returns match object for whole word wooly.\n",
"print(re.search(r\"o+l+\", \"boil\"))\n",
"# Here we are looking for one or more 'o' followed by one or more 'l'. In the word boil, there is only one o and two ls.\n",
"# So it returns match object for whole word boil. \n",
"print(re.search(r\"p?each\", \"To each their own\"))\n",
"# Here we are looking for an optional 'p' followed by 'each' as 'p' can occur zero or one time. In the string \n",
"# \"To each their own\", it returns None because there is no 'p' before 'each'. \n",
"print(re.search(r\"p?each\", \"I like peaches\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The repeating_letter_a function checks if the text passed includes the letter \"a\" (lowercase or uppercase) at least twice.\n",
"# For example, repeating_letter_a(\"banana\") is True, while repeating_letter_a(\"pineapple\") is False. Fill in the code to\n",
"# make this work. \n",
"import re\n",
"def repeating_letter_a(text):\n",
" result = re.search(r\"[Aa].*[Aa]\", text)\n",
" return result != None\n",
"\n",
"print(repeating_letter_a(\"banana\")) # True\n",
"print(repeating_letter_a(\"pineapple\")) # False\n",
"print(repeating_letter_a(\"Animal Kingdom\")) # True\n",
"print(repeating_letter_a(\"A is for apple\")) # True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"# This line of code uses regular expressions (regex) to search for any occurrence of '.com' in the word \"welcome\". \n",
"print(re.search(r\".com\", \"welcome\")) # <re.Match object; span=(2, 6), match='lcom'>\n",
"# This line of code uses regular expressions (regex) to search for any occurrence of '\\.com' in the word \"welcome\". \n",
"# The output will be None because there is no '.com' within the string 'welcome'. \n",
"# The backslash before '.' in the regex pattern escapes the period, making it match a literal period character rather than any character (as the period itself does in regex).\n",
"print(re.search(r\"\\.com\", \"welcome\")) # None\n",
"# This line of code uses regular expressions (regex) to search for any occurrence of '.com' in the word \"mydomain.com\". \n",
"print(re.search(r\"\\.com\", \"mydomain.com\")) # <re.Match object; span=(8, 12), match='.com'>\n",
"# In this \\w matches any alphanumeric character and '*' denotes zero or more repetitions in the word \"This is an example\". \n",
"print(re.search(r\"\\w*\", \"This is an example\")) # <re.Match object; span=(0, 4), match='This'>\n",
"# This line of code uses regular expressions (regex) to search for any occurrence of '\\w*' in the word \"And_this_is_another\". \n",
"print(re.search(r\"\\w*\", \"And_this_is_another\")) # <re.Match object; span=(0, 19), match='And_this_is_another'>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fill in the code to check if the text passed has at least 2 groups of alphanumeric characters\n",
"# (including letters, numbers, and underscores) separated by one or more whitespace characters.\n",
"import re\n",
"def check_character_groups(text):\n",
" result = re.search(r\"\\w\\s\\w\", text)\n",
" return result != None\n",
"\n",
"print(check_character_groups(\"One\")) # False\n",
"print(check_character_groups(\"123 Ready Set GO\")) # True\n",
"print(check_character_groups(\"username user_01\")) # True\n",
"print(check_character_groups(\"shopping_list: milk, bread, eggs.\")) # False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re # import Python's regular expressions module\n",
"# This searches for a pattern in 'Argentina' that starts with an \"A\" and ends with an \"a\". The .* is a wildcard that can match any character (except newline) between A and a. It returns the matching object span.\n",
"print(re.search(r\"A.*a\", \"Argentina\")) # <_sre.SRE_Match object; span=(0, 9), match='Argentina'>\n",
"# Same as previous search but in 'Azerbaijan'. It also returns the matching object span.\n",
"print(re.search(r\"A.*a\", \"Azerbaijan\")) # <_sre.SRE_Match object; span=(0, 9), match='Azerbaija'>\n",
"# This checks if the entire string in 'Australia' starts with an \"A\" and ends with an \"a\". The ^ signifies start of a line and $ signifies end of a line. It returns None because Australia doesn't have a full stop at the end.\n",
"print(re.search(r\"^A.*a$\", \"Australia\")) # <re.Match object; span=(0, 9), match='Australia'>\n",
"# This is a pattern that matches a string if it starts with alphanumeric character (including underscore) and then followed by any number of alphanumeric characters or underscores. \n",
"pattern = r\"^[a-zA-Z_][a-zA-Z0-9_]*$\" # valid variable pattern in Python according to the standard conventions\n",
"# This searches if \"_this_is_a_valid_variable_name\" matches the pattern. It returns the matching object span because it does match the pattern.\n",
"print(re.search(pattern, \"_this_is_a_valid_variable_name\")) # <_sre.SRE_Match object; span=(0, 28), match='_this_is_a_valid_variable_name'>\n",
"# This searches if \"this isn't a valid variable\" matches the pattern. It returns None because it contains space which is not allowed in Python variables according to standard conventions.\n",
"print(re.search(pattern, \"this isn't a valid variable\")) # None\n",
"# This searches if \"my_variable1\" matches the pattern. It returns matching object span as it does match the pattern.\n",
"print(re.search(pattern, \"my_variable1\")) # <_sre.SRE_Match object; span=(0, 12), match='my_variable1'>\n",
"# This searches if \"2my_variable1\" matches the pattern. It returns None because it starts with a digit which is not allowed in Python variables according to standard conventions.\n",
"print(re.search(pattern, \"2my_variable1\")) # None\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fill in the code to check if the text passed looks like a standard sentence, meaning that it starts with an\n",
"# uppercase letter, followed by at least some lowercase letters or a space, and ends with a period, question\n",
"# mark, or exclamation point. \n",
"import re\n",
"def check_sentence(text):\n",
" result = re.search(r\"^[A-Z][a-z\\s].*[\\.?!]$\", text)\n",
" return result != None\n",
"\n",
"print(check_sentence(\"Is this is a sentence?\")) # True\n",
"print(check_sentence(\"is this is a sentence?\")) # False\n",
"print(check_sentence(\"Hello\")) # False\n",
"print(check_sentence(\"1-2-3-GO!\")) # False\n",
"print(check_sentence(\"A star is born.\")) # True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- `r\"\\d{3}-\\d{3}-\\d{4}\"` matches U.S. phone numbers in the format 111-222-3333.\n",
"- `r\"^-?\\d*(\\.\\d+)?$\"` matches any positive or negative number, with or without decimal places.\n",
"- `r\"^(.+)\\/([^\\/]+)\\/\"` matches any path and filename."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#The check_web_address() function checks if the text passed qualifies as a top-level web address,\n",
"# meaning that it contains alphanumeric characters (which includes letters, numbers, and underscores),\n",
"# as well as periods, dashes, and a plus sign, followed by a period and a character-only top-level\n",
"# domain such as \".com\", \".info\", \".edu\", etc. Fill in the regular expression to do that, using escape\n",
"# characters, wildcards, repetition qualifiers, beginning and end-of-line characters, and character classes.\n",
"import re\n",
"def check_web_address(text):\n",
" pattern = r\"^[A-Za-z0-9_.-]*\\.[A-Za-z]+$\"\n",
" result = re.search(pattern, text)\n",
" return result != None\n",
"\n",
"print(check_web_address(\"gmail.com\")) # True\n",
"print(check_web_address(\"www@google\")) # False\n",
"print(check_web_address(\"www.Coursera.org\")) # True\n",
"print(check_web_address(\"web-address.com/homepage\")) # False\n",
"print(check_web_address(\"My_Favorite-Blog.US\")) # True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def check_time(text):\n",
" pattern = r\"[0-9][:][0-5][0-9]\\s?(am|AM|pm|PM)\"\n",
" result = re.search(pattern, text)\n",
" return result != None\n",
"\n",
"print(check_time(\"12:45pm\")) # True\n",
"print(check_time(\"9:59 AM\")) # True\n",
"print(check_time(\"6:60am\")) # False\n",
"print(check_time(\"five o'clock\")) # False\n",
"print(check_time(\"6:02 am\")) # True\n",
"print(check_time(\"6:02km\")) # False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def contains_acronym(text):\n",
" pattern = r\".*\\([A-Za-z0-9]+\\).*\"\n",
" result = re.search(pattern, text)\n",
" return result != None\n",
"\n",
"print(contains_acronym(\"Instant messaging (IM) is a set of communication technologies used for text-based communication\")) # True\n",
"print(contains_acronym(\"American Standard Code for Information Interchange (ASCII) is a character encoding standard for electronic communication\")) # True\n",
"print(contains_acronym(\"Please do NOT enter without permission!\")) # False\n",
"print(contains_acronym(\"PostScript is a fourth-generation programming language (4GL)\")) # True\n",
"print(contains_acronym(\"Have fun using a self-contained underwater breathing apparatus (Scuba)!\")) # True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def check_zip_code (text):\n",
" result = re.search(r\"^.*\\s(\\d{5})(-\\d{4})?.*$\", text)\n",
" return result != None\n",
"\n",
"print(check_zip_code(\"The zip codes for New York are 10001 thru 11104.\")) # True\n",
"print(check_zip_code(\"90210 is a TV show\")) # False\n",
"print(check_zip_code(\"Their address is: 123 Main Street, Anytown, AZ 85258-0001.\")) # True\n",
"print(check_zip_code(\"The Parliament of Canada is at 111 Wellington St, Ottawa, ON K1A0A9.\")) # False"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Advanced Matching"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"# The regex is searching for a string that starts with a word (group 1) followed by a comma and space (group 2)\n",
"# then ends with another word\n",
"result = re.search(r\"^(\\w*), (\\w*)$\", \"Lovelace, Ada\")\n",
"print(result)\n",
"print(result.groups())\n",
"print(result[0])\n",
"print(result[1])\n",
"print(result[2])\n",
"\"{} {}\".format(result[2], result[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Groups"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Lovelace Ada\n",
"Ada Lovelace\n"
]
}
],
"source": [
"import re\n",
"def rearrange_name(name):\n",
" result = re.search(r\"^(\\w*), (\\w*)$\", name)\n",
" if result is None:\n",
" return name\n",
" return \"{} {}\".format(result[1], result[2])\n",
"def rearrange_surname(name):\n",
" result = re.search(r\"^(\\w*), (\\w*)$\", name)\n",
" if result is None:\n",
" return name\n",
" return \"{} {}\".format(result[2], result[1])\n",
"print(rearrange_name(\"Lovelace, Ada\"))\n",
"print(rearrange_surname(\"Lovelace, Ada\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fix the regular expression used in the rearrange_name function so that it can match middle names, middle initials, as well as double surnames.\n",
"import re\n",
"def rearrange_name(name):\n",
"# result = re.search(r\"^(\\w*), (\\w*)$\", name) # from this to:\n",
" result = re.search(r\"^(\\w.*\\w*), (\\w.*\\w*)$\", name)\n",
" if result == None:\n",
" return name\n",
" return \"{} {}\".format(result[2], result[1])\n",
"\n",
"name=rearrange_name(\"Kennedy, John F.\")\n",
"print(name)\n",
"\n",
"# Output:\n",
"# John F. Kennedy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Repetition Qualifiers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"print(re.search(r\"[a-zA-Z]{5}\", \"a ghost\")) # This line searches for any alphabetic character (a-z or A-Z) sequence of length 5 in the string \"a ghost\". It won't find a match because there is no such sequence, so it returns None.\n",
"print(re.search(r\"[a-zA-Z]{5}\", \"a scary ghost appeared\")) # This line searches for any alphabetic character (a-z or A-Z) sequence of length 5 in the string \"a scary ghost appeared\". It will find a match for 'scary' and return it as a Match object.\n",
"print(re.findall(r\"[a-zA-Z]{5}\", \"a scary ghost appeared\")) # This line finds all (non-overlapping) occurrences of any alphabetic character (a-z or A-Z) sequence of length 5 in the string \"a scary ghost appeared\". It will find matches for 'scary' and 'ghost', returning them as a list of strings ['scary', 'ghost'].\n",
"re.findall(r\"\\b[a-zA-Z]{5}\\b\", \"A scary ghost appeared\") # This line finds all (non-overlapping) occurrences of any alphabetic character (a-z or A-Z) sequence of length 5 that are separate words in the string \"A scary ghost appeared\". It won't find a match for 'scary' and 'ghost' because they aren't standalone words, so it returns an empty list.\n",
"print(re.findall(r\"\\w{5,10}\", \"I really like strawberries\")) # This line finds all (non-overlapping) occurrences of a word composed of alphanumeric characters that is between 5 and 10 characters long in the string \"I really like strawberries\". It will find matches for 'really' and 'strawberries', returning them as a list of strings ['really', 'strawberries'].\n",
"print(re.findall(r\"\\w{5,}\", \"I really like strawberries\")) # This line finds all (non-overlapping) occurrences of a word composed of alphanumeric characters that is at least 5 characters long in the string \"I really like strawberries\". It will find matches for 'really' and 'strawberries', returning them as a list of strings ['really', 'strawberries'].\n",
"print(re.search(r\"s\\w{,20}\", \"I really like strawberries\")) # This line searches for any word that starts with the letter s followed by less than or equal to 20 alphanumeric characters in the string \"I really like strawberries\". It will find a match for 'strawberries' because it starts with 's' and is followed by fewer than or equal to 20 alphanumeric characters, returning it as a list of strings ['strawberries'].\n",
"\n",
"# Output:\n",
"# <re.Match object; span=(2, 7), match='ghost'>\n",
"# <re.Match object; span=(2, 7), match='scary'>\n",
"# ['scary', 'ghost', 'appea']\n",
"# ['really', 'strawberri']\n",
"# ['really', 'strawberries']\n",
"# <re.Match object; span=(14, 26), match='strawberries'>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extracting a PID using regexes in Python\n",
"import re\n",
"log = \"July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade\"\n",
"regex = r\"\\[(\\d+)\\]\"\n",
"result = re.search(regex, log)\n",
"result = re.search(regex, \"A completely different string that also has numbers [34567]\")\n",
"result = re.search(regex, \"99 elephants in a [cage]\")\n",
"def extract_pid(log_line):\n",
" regex = r\"\\[(\\d+)\\]\"\n",
" result = re.search(regex, log_line)\n",
" if result is None:\n",
" return \"\"\n",
" return result[1]\n",
"print(extract_pid(log))\n",
"print(extract_pid(\"99 elephants in a [cage]\"))\n",
"# Output:\n",
"# 12345"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def extract_pid(log_line):\n",
" regex = r\"\\[(\\d+)\\]: (\\w+)\" # Modify regex to return uppercase message in parenthesis after process ID\n",
" result = re.search(regex, log_line)\n",
" if result is None:\n",
" return None\n",
" pid = result.groups()[0] # Fetch the first group (process id)\n",
" message = result.groups()[1] # fetch the second group (uppercase message)\n",
" return \"{} ({})\".format(pid, message) \n",
"\n",
"print(extract_pid(\"July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade\")) # 12345 (ERROR)\n",
"print(extract_pid(\"99 elephants in a [cage]\")) # None\n",
"print(extract_pid(\"A string that also has numbers [34567] but no uppercase message\")) # None\n",
"print(extract_pid(\"July 31 08:08:08 mycomputer new_process[67890]: RUNNING Performing backup\")) # 67890 (RUNNING)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Splitting and replacing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"print(re.split(r\"[.?!]\", \"One sentence. Another one? And the last one!\"))\n",
"print(re.split(r\"([.?!])\", \"One sentence. Another one? And the last one!\"))\n",
"print(re.sub(r\"[\\w.%+-]+@[\\w.-]+\", \"[REDACTED]\", \"Received an email for go_nuts95@my.example.com\"))\n",
"print(re.sub(r\"^([\\w .-]*), ([\\w .-]*)$\", r\"\\2 \\1\", \"Lovelace, Ada\"))\n",
"\n",
"# Output:\n",
"# ['One sentence', ' Another one', ' And the last one', '']\n",
"# ['One sentence', '.', ' Another one', '?', ' And the last one', '!', '']\n",
"# Received an email for [REDACTED]\n",
"# Ada Lovelace"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"----"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sabrina Green,+1-802-867-5309,System Administrator\n",
"Eli Jones,+1-684-3481127,IT specialist\n",
"Melody Daniels,+1-846-687-7436,Programmer\n",
"Charlie Rivera,+1-698-746-3357,Web Developer\n"
]
}
],
"source": [
"# Question 1 X -> V\n",
"# Youre working with a CSV file that contains employee information. Each record has a name field, followed by a phone number field, and a role field. The phone number field contains U.S. phone numbers and needs to be modified to the international format, with +1- in front of the phone number. The rest of the phone number should not change. Fill in the regular expression, using groups, to use the transform_record() function to do that.\n",
"\n",
"import re\n",
"def transform_record(record):\n",
" new_record = re.sub(r'(\\w+ \\w+),(\\d{3}-\\d{7}|\\d{3}-\\d{3}-\\d{4}),(\\w+\\s*\\w+)', r'\\1,+1-\\2,\\3', record)\n",
" return new_record\n",
"\n",
"print(transform_record(\"Sabrina Green,802-867-5309,System Administrator\")) # Sabrina Green,+1-802-867-5309,System Administrator\n",
"print(transform_record(\"Eli Jones,684-3481127,IT specialist\")) # Eli Jones,+1-684-3481127,IT specialist\n",
"print(transform_record(\"Melody Daniels,846-687-7436,Programmer\")) # Melody Daniels,+1-846-687-7436,Programmer\n",
"print(transform_record(\"Charlie Rivera,698-746-3357,Web Developer\")) # Charlie Rivera,+1-698-746-3357,Web Developer\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Question 2 V\n",
"# The multi_vowel_words() function returns all words with 3 or more consecutive vowels (a, e, i, o, u). Fill in the regular expression to do that.\n",
"import re\n",
"def multi_vowel_words(text):\n",
" pattern = r'\\b\\w*[aeiou]{3}\\w*\\b'\n",
" result = re.findall(pattern, text)\n",
" return result\n",
"\n",
"print(multi_vowel_words(\"Life is beautiful\")) # ['beautiful']\n",
"print(multi_vowel_words(\"Obviously, the queen is courageous and gracious.\")) # ['Obviously', 'queen', 'courageous', 'gracious']\n",
"print(multi_vowel_words(\"The rambunctious children had to sit quietly and await their delicious dinner.\")) # ['rambunctious', 'quietly', 'delicious']\n",
"print(multi_vowel_words(\"The order of a data queue is First In First Out (FIFO)\")) # ['queue']\n",
"print(multi_vowel_words(\"Hello world!\")) # []"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"// Start of program\n",
" number = 0 // Initialize the variable\n",
" number += 1 // Increment the variable\n",
" return(number)\n"
]
}
],
"source": [
"# Question 4 X -> V\n",
"# The transform_comments() function converts comments in a Python script into those usable by a C compiler. This means looking for text that begins with a hash mark (#) and replacing it with double slashes (//), which is the C single-line comment indicator. For the purpose of this exercise, we'll ignore the possibility of a hash mark embedded inside of a Python command, and assume that it's only used to indicate a comment. We also want to treat repetitive hash marks (##), (###), etc., as a single comment indicator, to be replaced with just (//) and not (#//) or (//#). Fill in the parameters of the substitution method to complete this function: \n",
"import re\n",
"def transform_comments(line_of_code):\n",
" result = re.sub('#+', '//', line_of_code)\n",
" return result\n",
"\n",
"print(transform_comments(\"### Start of program\")) # Should be \"// Start of program\"\n",
"print(transform_comments(\" number = 0 ## Initialize the variable\")) # Should be \" number = 0 // Initialize the variable\"\n",
"print(transform_comments(\" number += 1 # Increment the variable\")) # Should be \" number += 1 // Increment the variable\"\n",
"print(transform_comments(\" return(number)\")) # Should be \" return(number)\"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"My number is (212) 345-9999.\n",
"Please call (888) 555-1234\n",
"123-123-12345\n",
"Phone number of Buckingham Palace is +44 303 123 7300\n"
]
}
],
"source": [
"# Question 5\n",
"# The convert_phone_number() function checks for a U.S. phone number format: XXX-XXX-XXXX (3 digits followed by a dash, 3 more digits followed by a dash, and 4 digits), and converts it to a more formal format that looks like this: (XXX) XXX-XXXX. Fill in the regular expression to complete this function.\n",
"import re\n",
"def convert_phone_number(phone):\n",
" result = re.sub(r\"\\b(\\d{3})-(\\d{3})-(\\d{4})\\b\", r\"(\\1) \\2-\\3\", phone)\n",
" return result\n",
"\n",
"print(convert_phone_number(\"My number is 212-345-9999.\")) # My number is (212) 345-9999.\n",
"print(convert_phone_number(\"Please call 888-555-1234\")) # Please call (888) 555-1234\n",
"print(convert_phone_number(\"123-123-12345\")) # 123-123-12345\n",
"print(convert_phone_number(\"Phone number of Buckingham Palace is +44 303 123 7300\")) # Phone number of Buckingham Palace is +44 303 123 7300"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Glossary\n",
"### Terms and definitions from Course 2, Module 3\n",
"**Alternation:** RegEx that matches any one of the alternatives separated by the pipe symbol.\n",
"\n",
"**Backreference:** This is applied when using re.sub( ) to substitute the value of a capture group into the output.\n",
"\n",
"**Character classes:** These are written inside square brackets and let us list the characters we want to match inside of those brackets.\n",
"\n",
"**Character ranges:** Ranges used to match a single character against a set of possibilities.\n",
"\n",
"**grep:** An especially easy to use yet extremely powerful tool for applying RegExes.\n",
"\n",
"**Lookahead:** RegEx that matches a pattern only if it's followed by another pattern.\n",
"\n",
"**Regular expression:** A search query for text that's expressed by string pattern, also known as RegEx or RegExp.\n",
"\n",
"**Wildcard:** A character that can match more than one character.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lab"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python3\n",
"import re\n",
"import csv\n",
"\n",
"def contains_domain(address, domain):\n",
" \"\"\"Returns True if the email address contains the given,domain,in the domain position, false if not.\"\"\"\n",
" domain_pattern = r'[\\w\\.-]+@'+domain+'$'\n",
" if re.match(domain_pattern,address):\n",
" return True\n",
" return False\n",
"\n",
"def replace_domain(address, old_domain, new_domain):\n",
" \"\"\"Replaces the old domain with the new domain in the received address.\"\"\"\n",
" old_domain_pattern = r'' + old_domain + '$'\n",
" address = re.sub(old_domain_pattern, new_domain, address)\n",
" return address\n",
"\n",
"def main():\n",
"    \"\"\"Processes the list of emails, replacing any instances of the old domain with the new domain.\n",
"\n",
"    Reads the CSV at csv_file_location, rewrites every address in the\n",
"    ' Email Address' column that contains_domain() reports as belonging to\n",
"    old_domain, and writes all rows (header included) to report_file.\n",
"\n",
"    Raises ValueError if the header row has no ' Email Address' column.\n",
"    \"\"\"\n",
"    old_domain, new_domain = 'abc.edu', 'xyz.edu'\n",
"    csv_file_location = '<csv_file_location>'\n",
"    report_file = '<data-directory>' + '/updated_user_emails.csv'\n",
"\n",
"    # The with-statement closes the file on exit, so no explicit close() is needed.\n",
"    # newline='' is the mode the csv module documents for files it reads or writes.\n",
"    with open(csv_file_location, 'r', newline='') as f:\n",
"        user_data_list = list(csv.reader(f))\n",
"\n",
"    # The header cell is looked up with a leading space (' Email Address').\n",
"    email_key = ' ' + 'Email Address'\n",
"    email_index = user_data_list[0].index(email_key)\n",
"\n",
"    # One pass over the data rows, using the column found in the header.\n",
"    for user in user_data_list[1:]:\n",
"        if len(user) <= email_index:\n",
"            continue  # blank or short row: nothing to rewrite\n",
"        email_address = user[email_index].strip()\n",
"        if contains_domain(email_address, old_domain):\n",
"            # Written back with a single leading space.\n",
"            user[email_index] = ' ' + replace_domain(email_address, old_domain, new_domain)\n",
"\n",
"    with open(report_file, 'w', newline='') as output_file:\n",
"        csv.writer(output_file).writerows(user_data_list)\n",
"\n",
"main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}