From eb1a27d0c48baf4b180003f079553b7447cda18f Mon Sep 17 00:00:00 2001
From: radoskov <radoslav.skoviera@cvut.cz>
Date: Wed, 5 Mar 2025 12:41:19 +0100
Subject: [PATCH] Finished lecture 3

---
 src/pge_lectures/lecture_03/awesome_table.md  |   7 +
 src/pge_lectures/lecture_03/l3_strings_io.qmd | 191 +++++++++++++++++-
 2 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 src/pge_lectures/lecture_03/awesome_table.md

diff --git a/src/pge_lectures/lecture_03/awesome_table.md b/src/pge_lectures/lecture_03/awesome_table.md
new file mode 100644
index 0000000..8e109fa
--- /dev/null
+++ b/src/pge_lectures/lecture_03/awesome_table.md
@@ -0,0 +1,7 @@
+| id | name | volume | radius | max_prop | contained_object |
+| --- | --- | --- | --- | --- | --- |
+| 0 | Teapot | 1.4 | 0.12 | Awesomeness | Apple, Banana |
+| 1 | Blender | 0.8 | 0.25, 0.15 | Coolness Factor | Carrot |
+| 2 | Mug, Cup | 1.9 | 0.08 | Radness Level | Potato |
+| 3 | Saucepan | 0.5 | 0.22 | Funkiness Quotient | Toast, Bread |
+| 4 | Pitcher | 1.1 | 0.18 | Grooviness Index | Coffee Bean |
diff --git a/src/pge_lectures/lecture_03/l3_strings_io.qmd b/src/pge_lectures/lecture_03/l3_strings_io.qmd
index c5d9c92..69e3002 100644
--- a/src/pge_lectures/lecture_03/l3_strings_io.qmd
+++ b/src/pge_lectures/lecture_03/l3_strings_io.qmd
@@ -766,12 +766,98 @@ def search(sentence, word):
 
 search(sentence, "need")
 search(sentence, "sentence")
+search(sentence, "long sen")
 search(sentence, "loop")
 ```
 
+For a whole-word search, this would also work (and would actually be more efficient):
+
+```{python}
+def search_word(sentence, word):  # prints whether `word` is one of the whitespace-separated words
+    for sword in sentence.split():  # split() with no argument splits on any run of whitespace
+        if sword == word:  # exact match only; punctuation attached to a word is part of it
+            print(f"Found '{word}'")
+            break
+    else:  # this gets executed if we don't 'break out' of the loop
+        print(f"Could not find '{word}'")
+
+search_word(sentence, "need")
+search_word(sentence, "loop")
+search_word(sentence, "long sen")  # not a whole word
+```
+
 It is also possible to loop through a string using comprehension syntax.
 This can make the code more readable but it is appropriate only for simple tasks.
 
+```{python}
+sentence = "This is a very long sentence where you need to find something."
+# split the sentence on whitespace and keep only the words that are equal to "need"
+words = [word for word in sentence.split() if word == "need"]
+print(words)  # a list with one entry per matching word
+```
+
+### String similarity
+
+The 'standard' method of comparing strings using `==` provides a **hard** comparison:
+the strings are either the same or not. Sometimes, however, we might need a **soft**
+comparison, i.e., we might want to measure **similarity** of the strings.
+There is a group of string (or any sequence, actually) similarity measures,
+called [**edit distances**](https://en.wikipedia.org/wiki/Edit_distance).
+Examples of edit distances are the [**Levenshtein distance**](https://en.wikipedia.org/wiki/Levenshtein_distance),
+[Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
+or the [Longest Common Subsequence](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem).
+They are called edit distances, since they measure how many changes, i.e., 'edits'
+need to be done to one string in order to transform it to the other string.
+These measures differ in what types of edits are allowed:
+- substitutions (change a character into another)
+- deletions
+- insertions
+- transpositions (swapping two adjacent characters; e.g., "abc" vs "acb" - 'b' and 'c' swapped)
+
+Here is an example of the Hamming distance:
+
+```{python}
+def hamming_distance(s1, s2):
+    """
+    Calculate the Hamming distance between two equal-length strings.
+    Returns the number of positions where the characters differ.
+    """
+    if len(s1) != len(s2):  # positions can only be compared one-to-one for equal lengths
+        raise ValueError("Strings must be of equal length")
+    return sum(c1 != c2 for c1, c2 in zip(s1, s2))  # each mismatch is True, which sum() counts as 1
+
+print(hamming_distance("karolina", "kathrina"))  # 3 ("rol" vs "thr")
+```
+
+Hamming distance allows only substitutions; therefore, the strings must be of equal length.
+
+There is also a built-in Python library for computing the similarity of strings (texts),
+called [`difflib`](https://docs.python.org/3/library/difflib.html).
+
+```{python}
+from difflib import SequenceMatcher
+
+def similarity(s1, s2):  # similarity score as a float in [0, 1]; 1.0 means identical sequences
+    matcher = SequenceMatcher(None, s1, s2)  # first argument is the 'junk' filter; None = nothing is junk
+    return matcher.ratio()  # 2 * M / T (M = number of matched elements, T = total length of both sequences)
+
+def longest_common_subsequence(s1, s2):  # NOTE: despite the name, this returns a contiguous common substring
+    matcher = SequenceMatcher(None, s1, s2)
+    lcs = matcher.find_longest_match(0, len(s1), 0, len(s2))  # longest *contiguous* matching block over both full strings
+    return s1[lcs.a : lcs.a + lcs.size]  # lcs.a = start of the match in s1, lcs.size = its length
+
+s1 = "This is a very long sentence where you need to find something."
+s2 = "This is a very long sentence."
+s3 = "This is not a can of words where you need to fly something, or whatever."
+
+print(f'{"s1 self-similarity":<20}: {similarity(s1, s1)}')  # '<20' left-aligns the label in a 20-character field
+print(f'{"s1 self-lcs":<20}: "{longest_common_subsequence(s1, s1)}"')
+print(f'{"s1 to s2 similarity":<20}: {similarity(s1, s2)}')
+print(f'{"s1 to s2 lcs":<20}: "{longest_common_subsequence(s1, s2)}"')
+print(f'{"s1 to s3 similarity":<20}: {similarity(s1, s3)}')
+print(f'{"s1 to s3 lcs":<20}: "{longest_common_subsequence(s1, s3)}"')
+```
+
 ## Files
 
 ### File path prelude
@@ -1014,4 +1100,107 @@ The created image:
 
 ![Image](image.png){width=20%}
 
-## Parsing strings
+## Parsing strings and structured output
+
+### Parsing structured strings
+
+Parsing strings means separating strings into some meaningful "tokens" (bits of string with some predefined meaning).
+There are multiple ways to do it. We will look at parsing using stacks & queues later, when we discuss stack and queue ADTs.
+Here, we will show how to parse strings with the `split` method.
+
+Let's first load some data:
+
+```{python}
+import os
+table_path = os.path.join(os.getcwd(), "awesome_table.md")
+
+if os.path.exists(table_path):
+    with open(table_path, "r") as f:
+        table = f.read()
+else:
+    print("File 'awesome_table.md' does not exist, for some reason.")
+
+print("Here is some table:")
+print(table)
+```
+
+Now, we want to parse the data:
+1) Firstly, extract field names from the table header
+
+```{python}
+table_lines = table.split("\n")  # split by lines; a trailing newline leaves an empty last entry
+
+# Extract field names from the header (the first line of the table)
+field_names = table_lines[0].strip("| ").split("|")  # drop the outer pipes/spaces, then split by the vertical line
+field_names = [name.strip() for name in field_names]  # remove whitespace around each name
+
+print("Field names:", field_names)
+```
+
+2) Then, extract data from each row and put these as a separate "record" (dictionary) into a list. Each value for a field is separated by a vertical line (pipe). However, a field might have multiple values, separated by commas. We want to split those and store them in a list.
+
+```{python}
+records = []  # one dictionary per table row
+for line in table_lines[2:]:  # skip the header and the separator
+    split_clean_line = line.strip("| ").split("|")  # same clean-up as for the header: drop outer pipes/spaces, split into cells
+    if len(split_clean_line) < len(field_names):  # too few cells, e.g., an empty line at the end of the file
+        continue
+    print(f"Line: {split_clean_line}", line)  # show the parsed cells next to the raw line
+    record = {}
+    for i, value in enumerate(split_clean_line):  # NOTE: a row with more cells than field names raises an IndexError below
+        values = value.strip().split(",")  # a cell may hold several comma-separated values
+        if len(values) > 1:
+            record[field_names[i]] = [v.strip() for v in values]  # multiple values: store a list of stripped strings
+        else:
+            record[field_names[i]] = values[0]  # single value: store the plain (already stripped) string
+    records.append(record)
+```
+
+3) Finally, print the data:
+
+```{python}
+for ri, record in enumerate(records):  # ri is the index of the record in the list
+    print(f"Record {ri}:")
+    for field, value in record.items():
+        print(f"\t{field:<17}: {str(value)}")  # '<17' left-aligns the field name in a 17-character column
+```
+
+Be careful when splitting text by a character. Sometimes,
+the same character might be a part of the text, e.g.:
+
+```{python}
+# Goal: split by comma, but only where the comma is followed by a space ("apple,banana" stays one item)
+text_to_split_by_comma = "up, down, apple,banana, cucumber"
+print(f"Wrong splitting: {text_to_split_by_comma.split(',')}")  # also splits "apple,banana" and keeps the leading spaces
+print(f"Right splitting: {text_to_split_by_comma.split(', ')}")  # splits only on a comma followed by a space
+```
+
+This was a simple example, but sometimes things might get trickier.
+E.g., if a comma is used between numbers, don't split; otherwise, split.
+That is, we also want to split when a comma is used without a space between letters.
+In such cases, we can either loop through the text and replace commas
+between numbers with another character (we can then replace it back).
+Or, we can use what's called [regular expressions](https://en.wikipedia.org/wiki/Regular_expression).
+We will, however, not go into that topic here.
+
+### Structured output
+
+We want to print the parsed data back into a 'nice' table:
+
+```{python}
+column_width = 18  # width of every column, in characters
+header = '|' + '|'.join([f"{name:^{column_width}}" for name in field_names]) + '|'  # '^' centers each name in its column
+print(header)
+separator = '|' + '|'.join(['-' * column_width for name in field_names]) + '|'  # one run of dashes per column
+print(separator)
+rows = []  # formatted table rows, one string per record
+for ri, record in enumerate(records):
+    row = []
+    for value in record.values():
+        if isinstance(value, list):  # multi-valued fields were stored as lists while parsing
+            row.append(f"{', '.join(value):^{column_width}}")  # join the values back with ', ' before centering
+        else:
+            row.append(f"{value:^{column_width}}")  # padding only; values longer than the column are not truncated
+    rows.append('|' + '|'.join(row) + '|')
+print('\n'.join(rows))  # print all rows at once, one per line
+```
-- 
GitLab