🏠

Appendix B: Library API Cheat Sheets

os and pathlib

os.walk()

import os

for root, dirs, files in os.walk(start_path):
    # root: current directory path (string)
    # dirs: list of subdirectory names (not paths)
    # files: list of file names (not paths)
    pass

Key behaviors:

  - Walks top-down by default, yielding one (root, dirs, files) tuple per directory.
  - dirs and files contain bare names, not full paths — join with root to get paths.
  - Modifying dirs in place (e.g. dirs[:] = ...) prunes which subdirectories are visited.

Common operations:

# Get full path to file
full_path = os.path.join(root, filename)

# Skip certain directories
dirs[:] = [d for d in dirs if d not in ['.git', '__pycache__']]

# Count files
file_count = sum(len(files) for _, _, files in os.walk(path))

os.path

import os

# Path manipulation
os.path.join('dir', 'subdir', 'file.txt')  # → 'dir/subdir/file.txt'
os.path.basename('/path/to/file.txt')       # → 'file.txt'
os.path.dirname('/path/to/file.txt')        # → '/path/to'
os.path.split('/path/to/file.txt')          # → ('/path/to', 'file.txt')
os.path.splitext('file.txt')                # → ('file', '.txt')

# Path checks
os.path.exists(path)       # True if exists
os.path.isfile(path)       # True if file
os.path.isdir(path)        # True if directory
os.path.islink(path)       # True if symlink

# Path info
os.path.getsize(path)      # Size in bytes
os.path.getmtime(path)     # Last modified time
os.path.abspath(path)      # Absolute path

pathlib (Modern Alternative)

from pathlib import Path

# Create Path objects
p = Path('/path/to/file.txt')
p = Path.home() / 'documents' / 'file.txt'  # Use / for joining

# Properties
p.name          # 'file.txt'
p.stem          # 'file'
p.suffix        # '.txt'
p.parent        # Path('/path/to')
p.parts         # ('/', 'path', 'to', 'file.txt')

# Checks
p.exists()      # True if exists
p.is_file()     # True if file
p.is_dir()      # True if directory

# Operations
p.read_text()                    # Read entire file as string
p.read_bytes()                   # Read as bytes
p.write_text('content')          # Write string to file
list(p.glob('*.txt'))            # All .txt files in directory
list(p.rglob('*.txt'))           # Recursive glob (all subdirs)
list(p.iterdir())                # Iterate over directory contents

# Walking with pathlib
for path in Path(start).rglob('*.py'):
    if path.is_file():
        process(path)

When to use pathlib:

  - New code: object-oriented paths, joining with /, and built-in helpers
    (read_text, glob, rglob, iterdir) cover most filesystem tasks in fewer lines.

When to use os:

  - Existing code that already uses os.path, APIs that require plain path
    strings, or when you want os.walk's explicit (root, dirs, files) tuples
    with in-place directory pruning.


BeautifulSoup4

Parsing HTML

from bs4 import BeautifulSoup

# Parse HTML
soup = BeautifulSoup(html_string, 'html.parser')
soup = BeautifulSoup(html_string, 'lxml')  # Faster, needs lxml installed

# From file
with open('page.html', 'r') as f:
    soup = BeautifulSoup(f, 'html.parser')

Finding Elements

# Find single element
soup.find('div')                          # First div
soup.find('div', class_='main')           # First div with class="main"
soup.find('div', {'id': 'content'})       # By attribute dict
soup.find(id='content')                   # By keyword argument
soup.find('a', href=True)                 # Any <a> with href attribute

# Find all elements
soup.find_all('p')                        # All <p> tags
soup.find_all(['h1', 'h2', 'h3'])        # Any heading
soup.find_all('div', class_='item')       # All divs with class="item"
soup.find_all('a', limit=5)               # First 5 links

# CSS selectors (often easier)
soup.select('div.main')                   # div with class "main"
soup.select('#content')                   # Element with id="content"
soup.select('div > p')                    # p directly inside div
soup.select('div p')                      # Any p inside div
soup.select('a[href]')                    # Links with href
soup.select_one('div.main')               # First match only

Navigating the Tree

element = soup.find('div')

# Children (direct descendants only)
element.children          # Generator of direct children
list(element.children)    # Convert to list
element.contents          # List of children (includes text nodes)

# Descendants (all nested elements)
element.descendants       # Generator of all nested elements

# Parents
element.parent            # Direct parent
element.parents           # Generator of all parents up to root

# Siblings
element.next_sibling      # Next sibling (may be text node)
element.previous_sibling  # Previous sibling
element.next_siblings     # Generator of all following siblings
element.previous_siblings # Generator of all preceding siblings

Extracting Data

element = soup.find('a')

# Text content
element.get_text()                # All text inside element
element.get_text(strip=True)      # Strip whitespace
element.get_text(separator=' ')   # Join text with separator
element.string                    # Text if element has only one child

# Attributes
element['href']                   # Get href attribute (raises KeyError if missing)
element.get('href')               # Get href or None
element.get('href', 'default')    # Get href or default value
element.attrs                     # Dict of all attributes
element.name                      # Tag name ('a', 'div', etc.)

# Check element type
hasattr(element, 'name')          # True if element, False if text node
element.name == 'a'               # Check tag type

Common Patterns

# Get all links
for link in soup.find_all('a', href=True):
    url = link['href']
    text = link.get_text(strip=True)

# Extract table
table = soup.find('table')
rows = []
for tr in table.find_all('tr'):
    cells = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
    rows.append(cells)

# Find elements with specific text
for element in soup.find_all('p'):
    if 'search term' in element.get_text():
        process(element)

# Navigate parent chain
element = soup.find('span', class_='target')
section = element.find_parent('section')
heading = section.find('h2')

Important Gotchas

# Text nodes don't have .name
for child in element.children:
    if hasattr(child, 'name'):  # It's an element
        process(child)
    else:  # It's a text node
        pass

# .string only works if element has single text child
div.string  # May be None even if div contains text

# Class is a reserved word in Python
element.find(class_='main')     # ✓ Use class_ with underscore
element.find(attrs={'class': 'main'})  # ✓ Or pass an attrs dict

# Multiple classes
element.find(class_=['main', 'active'])  # Either class

Tree-sitter

Setup

import tree_sitter_python as tspython
from tree_sitter import Language, Parser

# Create language and parser
PY_LANGUAGE = Language(tspython.language())
parser = Parser(PY_LANGUAGE)

# Parse code (must be bytes)
source_code = "def hello(): pass"
tree = parser.parse(bytes(source_code, 'utf8'))

# Get root node
root = tree.root_node

Node Properties

node = tree.root_node

# Type and identity
node.type                    # String: 'module', 'function_definition', etc.
node.is_named               # True for named nodes, False for syntax (like ':')

# Position in source
node.start_byte             # Start position in bytes
node.end_byte               # End position in bytes
node.start_point            # (row, column) tuple
node.end_point              # (row, column) tuple

# Source code
node.text                   # Source code as bytes
node.text.decode('utf8')    # Decode to string

# Tree structure
node.children               # List of child nodes
node.child_count           # Number of children
node.parent                # Parent node (or None for root)

Accessing Children

# Direct access (recommended for most cases)
node.children               # List of all children
node.children[0]           # First child
node.named_children        # List of named children only (skips syntax tokens)

# By field name (semantic access) — fields are defined per node type,
# so call these on a matching node (e.g. a function_definition)
name_node = func_node.child_by_field_name('name')     # Function name node
params = func_node.child_by_field_name('parameters')  # Parameter list node

# By type
def find_by_type(node, target_type):
    if node.type == target_type:
        return node
    for child in node.children:
        result = find_by_type(child, target_type)
        if result:
            return result

TreeCursor (Advanced)

# Create cursor
cursor = node.walk()

# Manual navigation
cursor.goto_first_child()    # Move to first child (returns False if no children)
cursor.goto_next_sibling()   # Move to next sibling (returns False if none)
cursor.goto_parent()         # Move to parent
cursor.node                  # Current node at cursor position

# Typical pattern
cursor = root.walk()
if cursor.goto_first_child():
    process(cursor.node)
    while cursor.goto_next_sibling():
        process(cursor.node)
    cursor.goto_parent()

When to use TreeCursor:

  - Large trees where you visit many nodes and want to avoid materializing
    .children lists at every level of the traversal.

When NOT to use TreeCursor:

  - Everyday lookups: plain recursion over node.children (as in find_by_type
    above) is simpler and fast enough for most code.

Common Node Types (Python)

# Module level
'module'                    # Root node
'import_statement'          # import os
'import_from_statement'     # from os import path
'function_definition'       # def func():
'class_definition'          # class MyClass:

# Inside functions
'expression_statement'      # Any expression as statement
'call'                      # Function call
'return_statement'          # return value
'if_statement'              # if/elif/else
'for_statement'             # for loop
'while_statement'           # while loop

# Expressions
'binary_operator'           # +, -, *, etc.
'string'                    # String literal
'integer'                   # Integer literal
'identifier'                # Variable name
'attribute'                 # obj.attr
'subscript'                 # list[index]

Field Names (Semantic Access)

# function_definition fields
func_node.child_by_field_name('name')        # Function name
func_node.child_by_field_name('parameters') # Parameter list
func_node.child_by_field_name('body')       # Function body
func_node.child_by_field_name('return_type') # Return type annotation

# class_definition fields
class_node.child_by_field_name('name')      # Class name
class_node.child_by_field_name('superclasses')  # Base classes
class_node.child_by_field_name('body')      # Class body

# call fields
call_node.child_by_field_name('function')   # Function being called
call_node.child_by_field_name('arguments')  # Argument list

Common Patterns

# Find all functions
def find_functions(node):
    if node.type == 'function_definition':
        yield node
    for child in node.children:
        yield from find_functions(child)

# Get function names
functions = []
for func in find_functions(root):
    name_node = func.child_by_field_name('name')
    if name_node:
        functions.append(name_node.text.decode('utf8'))

# Extract source code
def get_source(node, source_bytes):
    return source_bytes[node.start_byte:node.end_byte].decode('utf8')

# Find by position
def node_at_position(root, line, column):
    point = (line, column)
    for node in traverse(root):
        if node.start_point <= point <= node.end_point:
            return node

json and jsonpath-ng

Standard json Module

import json

# Parse JSON
data = json.loads(json_string)        # From string
with open('data.json') as f:
    data = json.load(f)               # From file

# Write JSON
json_string = json.dumps(data)        # To string
json_string = json.dumps(data, indent=2)  # Pretty print
with open('out.json', 'w') as f:
    json.dump(data, f, indent=2)      # To file

Direct Access

# Simple access
value = data['key']
value = data['key']['subkey']
value = data['key'][0]['subkey']

# Safe access
value = data.get('key', default_value)
value = data.get('key', {}).get('subkey', default_value)

# Check existence
if 'key' in data:
    value = data['key']

jsonpath-ng (for complex queries)

from jsonpath_ng import parse

# Find all matches
jsonpath_expr = parse('$.store.book[*].author')
matches = jsonpath_expr.find(data)
authors = [match.value for match in matches]

# Common expressions
'$.store.book[*]'           # All books
'$..author'                 # All authors (recursive)
'$.store.book[0]'           # First book
'$.store.book[-1]'          # Last book
'$.store.book[0:2]'         # First two books
'$.store.book[?(@.price < 10)]'  # Books under $10

# Get paths and values
for match in matches:
    print(f"Path: {match.path}")
    print(f"Value: {match.value}")

When to use jsonpath-ng:

  - Deep or recursive queries ($..author), wildcards, slices, and filter
    expressions where the nesting path varies or is not fully known up front.

When NOT to use jsonpath-ng:

  - Simple, known paths: plain indexing (data['key']) and .get() chains are
    faster, dependency-free, and easier to read.


Quick Comparison Table

Task             | os.walk()     | pathlib         | BeautifulSoup    | Tree-sitter
-----------------|---------------|-----------------|------------------|-----------------
Find all X       | Manual filter | .rglob('*.ext') | .find_all('tag') | Recursive search
Get children     | files list    | .iterdir()      | .children        | .children
Get text         | Read file     | .read_text()    | .get_text()      | .text.decode()
Filter           | Modify dirs   | Generator expr  | .find_all(cond)  | Check .type
Memory efficient | Generator     | Generator       | Returns list     | Use TreeCursor
Get position     | N/A           | N/A             | N/A              | .start_point

Remember

  1. Try the simple thing first: Direct access before search, built-in methods before custom code
  2. Check the return type: Generator vs list matters for memory and re-iteration
  3. Handle missing data: Use .get(), try-except, or explicit checks
  4. Keep source bytes: Tree-sitter needs original bytes for accurate slicing
  5. Read the docs once: 5 minutes of documentation saves 30 minutes of trial and error