Appendix B: Library API Cheat Sheets
os and pathlib
os.walk()
import os
for root, dirs, files in os.walk(start_path):
    # root: current directory path (string)
    # dirs: list of subdirectory names (not paths)
    # files: list of file names (not paths)
    pass
Key behaviors:
- Returns a generator (memory efficient)
- `root` is always a complete path
- `dirs` and `files` are just names, not paths
- Modify `dirs` in place to control which subdirectories are visited (see the combined example below)
Common operations:
# Get full path to file
full_path = os.path.join(root, filename)
# Skip certain directories
dirs[:] = [d for d in dirs if d not in ['.git', '__pycache__']]
# Count files
file_count = sum(len(files) for _, _, files in os.walk(path))
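Putting the pieces together, a minimal runnable sketch (the excluded directory names are only examples):
import os

for root, dirs, files in os.walk('.'):
    # Prune in place: os.walk will not descend into the removed directories
    dirs[:] = [d for d in dirs if d not in ('.git', '__pycache__')]
    for name in files:
        print(os.path.join(root, name))  # names must be joined with root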
os.path
import os
# Path manipulation
os.path.join('dir', 'subdir', 'file.txt') # → 'dir/subdir/file.txt'
os.path.basename('/path/to/file.txt') # → 'file.txt'
os.path.dirname('/path/to/file.txt') # → '/path/to'
os.path.split('/path/to/file.txt') # → ('/path/to', 'file.txt')
os.path.splitext('file.txt') # → ('file', '.txt')
# Path checks
os.path.exists(path) # True if exists
os.path.isfile(path) # True if file
os.path.isdir(path) # True if directory
os.path.islink(path) # True if symlink
# Path info
os.path.getsize(path) # Size in bytes
os.path.getmtime(path) # Last modified time
os.path.abspath(path) # Absolute path
pathlib (Modern Alternative)
from pathlib import Path
# Create Path objects
p = Path('/path/to/file.txt')
p = Path.home() / 'documents' / 'file.txt' # Use / for joining
# Properties
p.name # 'file.txt'
p.stem # 'file'
p.suffix # '.txt'
p.parent # Path('/path/to')
p.parts # ('/', 'path', 'to', 'file.txt')
# Checks
p.exists() # True if exists
p.is_file() # True if file
p.is_dir() # True if directory
# Operations
p.read_text() # Read entire file as string
p.read_bytes() # Read as bytes
p.write_text('content') # Write string to file
list(p.glob('*.txt')) # All .txt files in directory
list(p.rglob('*.txt')) # Recursive glob (all subdirs)
list(p.iterdir()) # Iterate over directory contents
# Walking with pathlib
for path in Path(start).rglob('*.py'):
    if path.is_file():
        process(path)
When to use pathlib:
- More object-oriented, more readable
- Better for modern Python code
- `rglob()` is cleaner than `os.walk()` for simple searches
When to use os:
- More control over traversal (modifying dirs list)
- Working with older codebases
- Some operations on plain strings avoid Path-object overhead and can be faster
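As a side-by-side sketch, here is the same search written both ways (assumes the current directory; `.venv` is just an example exclude):
import os
from pathlib import Path

# pathlib: concise, but cannot prune directories mid-walk
py_files = [p for p in Path('.').rglob('*.py') if p.is_file()]

# os.walk: more verbose, but traversal can be controlled
py_files = []
for root, dirs, files in os.walk('.'):
    dirs[:] = [d for d in dirs if d != '.venv']
    py_files += [os.path.join(root, f) for f in files if f.endswith('.py')]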
BeautifulSoup4
Parsing HTML
from bs4 import BeautifulSoup
# Parse HTML
soup = BeautifulSoup(html_string, 'html.parser')
soup = BeautifulSoup(html_string, 'lxml') # Faster, needs lxml installed
# From file
with open('page.html', 'r') as f:
    soup = BeautifulSoup(f, 'html.parser')
Finding Elements
# Find single element
soup.find('div') # First div
soup.find('div', class_='main') # First div with class="main"
soup.find('div', {'id': 'content'}) # By attribute dict
soup.find(id='content') # By keyword argument
soup.find('a', href=True) # Any <a> with href attribute
# Find all elements
soup.find_all('p') # All <p> tags
soup.find_all(['h1', 'h2', 'h3']) # Any heading
soup.find_all('div', class_='item') # All divs with class="item"
soup.find_all('a', limit=5) # First 5 links
# CSS selectors (often easier)
soup.select('div.main') # div with class "main"
soup.select('#content') # Element with id="content"
soup.select('div > p') # p directly inside div
soup.select('div p') # Any p inside div
soup.select('a[href]') # Links with href
soup.select_one('div.main') # First match only
Navigating the Tree
element = soup.find('div')
# Children (direct descendants only)
element.children # Generator of direct children
list(element.children) # Convert to list
element.contents # List of children (includes text nodes)
# Descendants (all nested elements)
element.descendants # Generator of all nested elements
# Parents
element.parent # Direct parent
element.parents # Generator of all parents up to root
# Siblings
element.next_sibling # Next sibling (may be text node)
element.previous_sibling # Previous sibling
element.next_siblings # Generator of all following siblings
element.previous_siblings # Generator of all preceding siblings
Extracting Data
element = soup.find('a')
# Text content
element.get_text() # All text inside element
element.get_text(strip=True) # Strip whitespace
element.get_text(separator=' ') # Join text with separator
element.string # Text if element has exactly one string child, else None
# Attributes
element['href'] # Get href attribute (raises KeyError if missing)
element.get('href') # Get href or None
element.get('href', 'default') # Get href or default value
element.attrs # Dict of all attributes
element.name # Tag name ('a', 'div', etc.)
# Check element type
from bs4 import Tag
isinstance(element, Tag) # True if element, False if text node
element.name == 'a' # Check tag type
Common Patterns
# Get all links
for link in soup.find_all('a', href=True):
    url = link['href']
    text = link.get_text(strip=True)
# Extract table
table = soup.find('table')
rows = []
for tr in table.find_all('tr'):
    cells = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
    rows.append(cells)
# Find elements with specific text
for element in soup.find_all('p'):
    if 'search term' in element.get_text():
        process(element)
# Navigate parent chain
element = soup.find('span', class_='target')
section = element.find_parent('section')
heading = section.find('h2')
Important Gotchas
# Text nodes are NavigableString objects, not Tag
from bs4 import Tag
for child in element.children:
    if isinstance(child, Tag): # It's an element
        process(child)
    else: # It's a text node
        pass
# .string only works if element has single text child
div.string # May be None even if div contains text
# Class is a reserved word in Python
element.find(class_='main') # ✓ Use class_ with underscore
element.find(attrs={'class': 'main'}) # ✓ Or use the attrs dict
# Multiple classes
element.find(class_=['main', 'active']) # Either class
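A quick demonstration of the `.string` gotcha (a sketch; the HTML snippet is made up):
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div>Hello <b>world</b></div>', 'html.parser')
div = soup.find('div')
print(div.string)             # None: the div has two children, not one string
print(div.get_text())         # 'Hello world': works regardless of structure
print(soup.find('b').string)  # 'world': single string child, so .string works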
Tree-sitter
Setup
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
# Create language and parser
PY_LANGUAGE = Language(tspython.language())
parser = Parser(PY_LANGUAGE)
# Parse code (must be bytes)
source_code = "def hello(): pass"
tree = parser.parse(bytes(source_code, 'utf8'))
# Get root node
root = tree.root_node
Node Properties
node = tree.root_node
# Type and identity
node.type # String: 'module', 'function_definition', etc.
node.is_named # True for named nodes, False for syntax (like ':')
# Position in source
node.start_byte # Start position in bytes
node.end_byte # End position in bytes
node.start_point # (row, column) tuple
node.end_point # (row, column) tuple
# Source code
node.text # Source code as bytes
node.text.decode('utf8') # Decode to string
# Tree structure
node.children # List of child nodes
node.child_count # Number of children
node.parent # Parent node (or None for root)
Accessing Children
# Direct access (recommended for most cases)
node.children # List of all children
node.children[0] # First child
node.named_children # List of named children only (skips syntax tokens)
# By field name (semantic access; fields exist on specific node types)
name_node = func_node.child_by_field_name('name') # e.g., on a function_definition
params = func_node.child_by_field_name('parameters')
# By type
def find_by_type(node, target_type):
    if node.type == target_type:
        return node
    for child in node.children:
        result = find_by_type(child, target_type)
        if result:
            return result
    return None
TreeCursor (Advanced)
# Create cursor
cursor = node.walk()
# Manual navigation
cursor.goto_first_child() # Move to first child (returns False if no children)
cursor.goto_next_sibling() # Move to next sibling (returns False if none)
cursor.goto_parent() # Move to parent
cursor.node # Current node at cursor position
# Typical pattern
cursor = root.walk()
if cursor.goto_first_child():
    process(cursor.node)
    while cursor.goto_next_sibling():
        process(cursor.node)
    cursor.goto_parent()
When to use TreeCursor:
- Extremely large files (memory optimization)
- Need precise control over traversal order
- Building custom traversal algorithms
When NOT to use TreeCursor:
- Normal traversal (use `.children` instead)
- Don't mix approaches: `cursor.node.children` defeats the purpose
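When you do need one, the standard full-tree walk with a single cursor looks like this (a sketch; `traverse` is a hypothetical helper name, not a tree-sitter API):
def traverse(tree_or_node):
    # Yield every node in pre-order using one cursor (no recursion, low memory)
    cursor = tree_or_node.walk()
    while True:
        yield cursor.node
        if cursor.goto_first_child():          # go down first
            continue
        while not cursor.goto_next_sibling():  # then across
            if not cursor.goto_parent():       # then back up; done at root
                return
This visits every node exactly once without building intermediate child lists.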
Common Node Types (Python)
# Module level
'module' # Root node
'import_statement' # import os
'import_from_statement' # from os import path
'function_definition' # def func():
'class_definition' # class MyClass:
# Inside functions
'expression_statement' # Any expression as statement
'call' # Function call
'return_statement' # return value
'if_statement' # if/elif/else
'for_statement' # for loop
'while_statement' # while loop
# Expressions
'binary_operator' # +, -, *, etc.
'string' # String literal
'integer' # Integer literal
'identifier' # Variable name
'attribute' # obj.attr
'subscript' # list[index]
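Exact type names come from the grammar and can differ between grammar versions; when in doubt, parse a snippet and print them (a sketch reusing `parser` from Setup):
code = b"import os\n\ndef hello(name):\n    return name\n"
tree = parser.parse(code)
for child in tree.root_node.children:
    print(child.type)  # e.g. 'import_statement', 'function_definition'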
Field Names (Semantic Access)
# function_definition fields
func_node.child_by_field_name('name') # Function name
func_node.child_by_field_name('parameters') # Parameter list
func_node.child_by_field_name('body') # Function body
func_node.child_by_field_name('return_type') # Return type annotation
# class_definition fields
class_node.child_by_field_name('name') # Class name
class_node.child_by_field_name('superclasses') # Base classes
class_node.child_by_field_name('body') # Class body
# call fields
call_node.child_by_field_name('function') # Function being called
call_node.child_by_field_name('arguments') # Argument list
Common Patterns
# Find all functions
def find_functions(node):
    if node.type == 'function_definition':
        yield node
    for child in node.children:
        yield from find_functions(child)
# Get function names
functions = []
for func in find_functions(root):
    name_node = func.child_by_field_name('name')
    if name_node:
        functions.append(name_node.text.decode('utf8'))
# Extract source code
def get_source(node, source_bytes):
    return source_bytes[node.start_byte:node.end_byte].decode('utf8')
# Find by position (use the built-in lookup; a manual scan would match the
# root first, since the root spans every point)
def node_at_position(root, line, column):
    point = (line, column)
    return root.descendant_for_point_range(point, point)
json and jsonpath-ng
Standard json Module
import json
# Parse JSON
data = json.loads(json_string) # From string
with open('data.json') as f:
    data = json.load(f) # From file
# Write JSON
json_string = json.dumps(data) # To string
json_string = json.dumps(data, indent=2) # Pretty print
with open('out.json', 'w') as f:
    json.dump(data, f, indent=2) # To file
Direct Access
# Simple access
value = data['key']
value = data['key']['subkey']
value = data['key'][0]['subkey']
# Safe access
value = data.get('key', default_value)
value = data.get('key', {}).get('subkey', default_value)
# Check existence
if 'key' in data:
    value = data['key']
jsonpath-ng (for complex queries)
from jsonpath_ng import parse
# Find all matches
jsonpath_expr = parse('$.store.book[*].author')
matches = jsonpath_expr.find(data)
authors = [match.value for match in matches]
# Common expressions
'$.store.book[*]' # All books
'$..author' # All authors (recursive)
'$.store.book[0]' # First book
'$.store.book[-1]' # Last book
'$.store.book[0:2]' # First two books
'$.store.book[?(@.price < 10)]' # Books under $10 (filters need jsonpath_ng.ext.parse)
# Get paths and values
for match in matches:
    print(f"Path: {match.path}")
    print(f"Value: {match.value}")
When to use jsonpath-ng:
- Complex queries across unknown structure
- Need to find all instances matching pattern
- Working with APIs that document paths in JSONPath format
When NOT to use jsonpath-ng:
- Simple direct access (use `dict['key']` instead)
- Performance-critical code (direct access is faster)
- Need to modify data (JSONPath is for reading)
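To make the trade-off concrete, a side-by-side sketch (the data layout is made up):
from jsonpath_ng import parse

data = {'store': {'book': [{'author': 'A'}, {'author': 'B'}]}}

# Direct access: fast and obvious when the structure is known
authors = [book['author'] for book in data['store']['book']]

# JSONPath: pays off when 'author' may appear at unknown depths
authors = [m.value for m in parse('$..author').find(data)]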
Quick Comparison Table
| Task | os.walk() | pathlib | BeautifulSoup | Tree-sitter |
|---|---|---|---|---|
| Find all X | Manual filter | .rglob('*.ext') | .find_all('tag') | Recursive search |
| Get children | files list | .iterdir() | .children | .children |
| Get text | Read file | .read_text() | .get_text() | .text.decode() |
| Filter | Modify dirs | Generator expr | .find_all(cond) | Check .type |
| Memory efficient | Generator | Generator | Returns list | Use TreeCursor |
| Get position | N/A | N/A | N/A | .start_point |
Remember
- Try the simple thing first: Direct access before search, built-in methods before custom code
- Check the return type: Generator vs list matters for memory and re-iteration
- Handle missing data: Use `.get()`, `try`/`except`, or explicit checks
- Keep source bytes: Tree-sitter needs the original bytes for accurate slicing
- Read the docs once: 5 minutes of documentation saves 30 minutes of trial and error