From eb1a27d0c48baf4b180003f079553b7447cda18f Mon Sep 17 00:00:00 2001 From: radoskov <radoslav.skoviera@cvut.cz> Date: Wed, 5 Mar 2025 12:41:19 +0100 Subject: [PATCH] Finished lecture 3 --- src/pge_lectures/lecture_03/awesome_table.md | 7 + src/pge_lectures/lecture_03/l3_strings_io.qmd | 191 +++++++++++++++++- 2 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 src/pge_lectures/lecture_03/awesome_table.md diff --git a/src/pge_lectures/lecture_03/awesome_table.md b/src/pge_lectures/lecture_03/awesome_table.md new file mode 100644 index 0000000..8e109fa --- /dev/null +++ b/src/pge_lectures/lecture_03/awesome_table.md @@ -0,0 +1,7 @@ +| id | name | volume | radius | max_prop | contained_object | +| --- | --- | --- | --- | --- | --- | +| 0 | Teapot | 1.4 | 0.12 | Awesomeness | Apple, Banana | +| 1 | Blender | 0.8 | 0.25, 0.15 | Coolness Factor | Carrot | +| 2 | Mug, Cup | 1.9 | 0.08 | Radness Level | Potato | +| 3 | Saucepan | 0.5 | 0.22 | Funkiness Quotient | Toast, Bread | +| 4 | Pitcher | 1.1 | 0.18 | Grooviness Index | Coffee Bean | diff --git a/src/pge_lectures/lecture_03/l3_strings_io.qmd b/src/pge_lectures/lecture_03/l3_strings_io.qmd index c5d9c92..69e3002 100644 --- a/src/pge_lectures/lecture_03/l3_strings_io.qmd +++ b/src/pge_lectures/lecture_03/l3_strings_io.qmd @@ -766,12 +766,98 @@ def search(sentence, word): search(sentence, "need") search(sentence, "sentence") +search(sentence, "long sen") search(sentence, "loop") ``` +For whole word search, this would also work (and actually be more efficient): + +```{python} +def search_word(sentence, word): + for sword in sentence.split(): + if sword == word: + print(f"Found '{word}'") + break + else: # this gets executed if we don't 'break out' of the loop + print(f"Could not find '{word}'") + +search_word(sentence, "need") +search_word(sentence, "loop") +search_word(sentence, "long sen") # not a whole word +``` + It is also possible to loop through a string using comprehension syntax. 
This can make the code more readable but it is appropriate only for simple tasks. +```{python} +sentence = "This is a very long sentence where you need to find something." +# split the sentence by spaces and loop through 'words' +words = [word for word in sentence.split() if word == "need"] +print(words) +``` + +### String similarity + +The 'standard' method of comparing strings using `==` provides a **hard** comparison: +the strings are either the same or not. Sometimes, however, we might need a **soft** +comparison, i.e., we might want to measure **similarity** of the strings. +There is a group of string (or any sequence, actually) similarity measures, +called [**edit distances**](https://en.wikipedia.org/wiki/Edit_distance). +Examples of edit distances are the [**Levenshtein distance**](https://en.wikipedia.org/wiki/Levenshtein_distance), +[Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) +and the [Longest Common Subsequence](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem). +They are called edit distances, since they measure how many changes, i.e., 'edits' +need to be done to one string in order to transform it into the other string. +These measures differ in what types of edits are allowed: +- substitutions (change a character into another) +- deletions +- insertions +- transpositions ("moving" characters around; e.g., "abc" vs "bca" - 'a' moved to the end) + +Here is an example of the Hamming distance: + +```{python} +def hamming_distance(s1, s2): + """ + Calculate the Hamming distance between two equal-length strings. + Returns the number of positions where the characters differ. + """ + if len(s1) != len(s2): + raise ValueError("Strings must be of equal length") + return sum(c1 != c2 for c1, c2 in zip(s1, s2)) + +print(hamming_distance("karolina", "kathrina")) +``` + +Hamming distance allows only substitutions; therefore, the strings must be of equal length.
+ +There is also a built-in library in Python for computing similarity of strings (texts), +called [`difflib`](https://docs.python.org/3/library/difflib.html). Note that its `find_longest_match` method, used below, returns the longest *contiguous* matching block (a common substring), not a general longest common subsequence. + +```{python} +from difflib import SequenceMatcher + +def similarity(s1, s2): + matcher = SequenceMatcher(None, s1, s2) + return matcher.ratio() + +def longest_common_subsequence(s1, s2): + matcher = SequenceMatcher(None, s1, s2) + lcs = matcher.find_longest_match(0, len(s1), 0, len(s2)) + return s1[lcs.a : lcs.a + lcs.size] + +s1 = "This is a very long sentence where you need to find something." +s2 = "This is a very long sentence." +s3 = "This is not a can of words where you need to fly something, or whatever." + +print(f'{"s1 self-similarity":<20}: {similarity(s1, s1)}') +print(f'{"s1 self-lcs":<20}: "{longest_common_subsequence(s1, s1)}"') +print(f'{"s1 to s2 similarity":<20}: {similarity(s1, s2)}') +print(f'{"s1 to s2 lcs":<20}: "{longest_common_subsequence(s1, s2)}"') +print(f'{"s1 to s3 similarity":<20}: {similarity(s1, s3)}') +print(f'{"s1 to s3 lcs":<20}: "{longest_common_subsequence(s1, s3)}"') +``` + ## Files ### File path prelude @@ -1014,4 +1100,107 @@ The created image: {width=20%} -## Parsing strings +## Parsing strings and structured output + +### Parsing structured strings + +Parsing strings means separating strings into some meaningful "tokens" (bits of string with some predefined meaning). +There are multiple ways to do it. We will look at parsing using stacks & queues later, when we discuss stack and queue ADTs. +Here, we will show how to parse strings with the `split` method.
+ +Let's first load some data: + +```{python} +import os +table_path = os.path.join(os.getcwd(), "awesome_table.md") + +if os.path.exists(table_path): + with open(table_path, "r") as f: + table = f.read() +else: + print("File 'awesome_table.md' does not exist, for some reason.") + +print("Here is some table:") +print(table) +``` + +Now, we want to parse the data: +1) Firstly, extract field names from the table header + +```{python} +table_lines = table.split("\n") # split by lines + +# Extract field names +field_names = table_lines[0].strip("| ").split("|") # split by the vertical line +field_names = [name.strip() for name in field_names] # remove whitespace + +print("Field names:", field_names) +``` + +2) Then, extract data from each row and put these as a separate "record" (dictionary) into a list. Each value for a field is separated by a vertical line (pipe). However, a field might have multiple values, separated by comma. We want to split those and store them in a list. + +```{python} +records = [] +for line in table_lines[2:]: # skip the header and the separator + split_clean_line = line.strip("| ").split("|") + if len(split_clean_line) < len(field_names): + continue + print(f"Line: {split_clean_line}", line) + record = {} + for i, value in enumerate(split_clean_line): + values = value.strip().split(",") + if len(values) > 1: + record[field_names[i]] = [v.strip() for v in values] + else: + record[field_names[i]] = values[0] + records.append(record) +``` + +3) Finally, print the data: + +```{python} +for ri, record in enumerate(records): + print(f"Record {ri}:") + for field, value in record.items(): + print(f"\t{field:<17}: {str(value)}") +``` + +Be careful when splitting text by a character. 
Sometimes, +the same character might be a part of the text, e.g.: + +```{python} +# split by comma but only if there is a space between the comma and the next word +text_to_split_by_comma = "up, down, apple,banana, cucumber" +print(f"Wrong splitting: {text_to_split_by_comma.split(',')}") +print(f"Right splitting: {text_to_split_by_comma.split(', ')}") +``` + +This was a simple example but sometimes things might get trickier. +E.g., if a comma is used between numbers, don't split; otherwise, split. +That is, we want to split also when a comma is used without a space between letters. +In such cases, we can either loop through the text and replace commas +between numbers with another character (we can then replace it back). +Or, we can use what's called [regular expressions](https://en.wikipedia.org/wiki/Regular_expression). +We will, however, not go into that topic here. + +### Structured output + +We want to print the parsed data back into a 'nice' table: + +```{python} +column_width = 18 +header = '|' + '|'.join([f"{name:^{column_width}}" for name in field_names]) + '|' +print(header) +separator = '|' + '|'.join(['-' * column_width for name in field_names]) + '|' +print(separator) +rows = [] +for ri, record in enumerate(records): + row = [] + for value in record.values(): + if isinstance(value, list): + row.append(f"{', '.join(value):^{column_width}}") + else: + row.append(f"{value:^{column_width}}") + rows.append('|' + '|'.join(row) + '|') +print('\n'.join(rows)) +``` -- GitLab