# Appendix A: Quick Reference - Common Patterns

## File System Traversal

### Pattern 1: Find All Files by Extension
```python
import os

def find_files_by_extension(root_path, extension):
    """
    Find all files with given extension.
    Example: find_files_by_extension('/projects', '.py')
    """
    for root, dirs, files in os.walk(root_path):
        for file in files:
            if file.endswith(extension):
                yield os.path.join(root, file)
```
**When to use:** You know the extension and need all matching files.
**Returns:** Generator of full file paths.
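A quick usage sketch (the path here is hypothetical):

```python
# Iterate lazily over matches
for path in find_files_by_extension('/projects', '.py'):
    print(path)

# Or materialize the generator when you need a list
py_files = list(find_files_by_extension('/projects', '.py'))
```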
### Pattern 2: Calculate Directory Size
```python
import os

def calculate_dir_size(root_path):
    """Calculate total size of directory in bytes."""
    total_size = 0
    for root, dirs, files in os.walk(root_path):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                total_size += os.path.getsize(filepath)
            except OSError:
                # Skip files we can't read
                continue
    return total_size
```
**When to use:** Need aggregate statistics about directory contents.
**Note:** Always wrap file operations in try-except for robustness.
### Pattern 3: Skip Directories During Traversal
```python
import os

def find_files_skip_hidden(root_path):
    """Find files but skip hidden directories and __pycache__."""
    for root, dirs, files in os.walk(root_path):
        # Modify dirs in-place to skip certain directories
        dirs[:] = [d for d in dirs if not d.startswith('.') and d != '__pycache__']
        for file in files:
            yield os.path.join(root, file)
```
**Key insight:** Modifying `dirs` in place affects which directories `os.walk()` visits next.
**When to use:** Exclude version control directories, build artifacts, etc.
### Pattern 4: Build File Tree Structure
```python
from pathlib import Path

def build_tree(root_path, max_depth=None):
    """
    Build dictionary representation of directory tree.
    Returns: {'name': 'root', 'type': 'dir', 'children': [...]}
    """
    def _build(path, depth=0):
        path_obj = Path(path)
        node = {'name': path_obj.name, 'path': str(path)}
        if path_obj.is_file():
            node['type'] = 'file'
            node['size'] = path_obj.stat().st_size
        else:
            node['type'] = 'dir'
            if max_depth is None or depth < max_depth:
                node['children'] = []
                try:
                    for child in path_obj.iterdir():
                        node['children'].append(_build(child, depth + 1))
                except PermissionError:
                    node['error'] = 'Permission denied'
        return node
    return _build(root_path)
```
**When to use:** Need a structured representation of the filesystem for display or serialization.
**Note:** Includes depth limiting to prevent excessive recursion.
## JSON Navigation

### Pattern 5: Direct Access with Safety
```python
def safe_navigate(data, *keys, default=None):
    """
    Safely navigate nested dictionary.
    Example: safe_navigate(data, 'user', 'profile', 'name', default='Unknown')
    """
    result = data
    for key in keys:
        if isinstance(result, dict):
            result = result.get(key)
            if result is None:
                return default
        else:
            return default
    return result
```
**When to use:** Known path, but data might be incomplete.
**Alternative:** `data.get('user', {}).get('profile', {}).get('name')` (see the caveat below).
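The chained `.get()` alternative is shorter but less robust. A small sketch of where the two diverge:

```python
data = {'user': {'profile': None}}

# safe_navigate handles the None intermediate value:
safe_navigate(data, 'user', 'profile', 'name', default='Unknown')  # 'Unknown'

# The chained form raises AttributeError here, because .get('profile', {})
# returns the stored None (the default only applies to *missing* keys):
# data.get('user', {}).get('profile', {}).get('name')
```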
### Pattern 6: Find All Occurrences of a Key
```python
def find_all_values_for_key(data, target_key):
    """
    Find all values associated with a key, anywhere in nested structure.
    Example: find_all_values_for_key(api_response, 'email')
    Returns: ['user@example.com', 'admin@example.com', ...]
    """
    results = []
    def _search(obj):
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == target_key:
                    results.append(value)
                _search(value)
        elif isinstance(obj, list):
            for item in obj:
                _search(item)
    _search(data)
    return results
```
**When to use:** Schema unknown, need to collect all instances of a field.
**Common use case:** Extracting all email addresses, IDs, or URLs from an API response.
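A usage sketch with a hypothetical response:

```python
api_response = {
    'user': {'email': 'user@example.com'},
    'admins': [{'email': 'admin@example.com'}],
}
find_all_values_for_key(api_response, 'email')
# ['user@example.com', 'admin@example.com']
```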
### Pattern 7: Find Value with Path Tracking
```python
def find_with_path(data, target_key):
    """
    Find values and their paths in nested structure.
    Returns: [(['user', 'contacts', 0, 'email'], 'value'), ...]
    """
    results = []
    def _search(obj, path=()):
        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = path + (key,)
                if key == target_key:
                    results.append((list(current_path), value))
                _search(value, current_path)
        elif isinstance(obj, list):
            for idx, item in enumerate(obj):
                _search(item, path + (idx,))
    _search(data)
    return results
```
**When to use:** Need to know where in the structure you found the value.
**Use case:** Debugging, building edit operations, reporting data locations.
### Pattern 8: Flatten Nested Dictionary
```python
def flatten_dict(data, separator='.'):
    """
    Flatten nested dictionary to single level.
    Input: {'a': {'b': {'c': 1}}}
    Output: {'a.b.c': 1}
    """
    flat = {}
    def _flatten(obj, prefix=''):
        if isinstance(obj, dict):
            for key, value in obj.items():
                new_key = f"{prefix}{separator}{key}" if prefix else key
                _flatten(value, new_key)
        elif isinstance(obj, list):
            for idx, item in enumerate(obj):
                new_key = f"{prefix}[{idx}]"
                _flatten(item, new_key)
        else:
            flat[prefix] = obj
    _flatten(data)
    return flat
```
**When to use:** Need to convert nested data to a flat structure (e.g., for CSV export).
**Note:** Consider `pandas.json_normalize()` for complex cases, as sketched below.
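A minimal sketch of the `pandas` alternative, assuming tabular output is the goal:

```python
import pandas as pd

# json_normalize flattens nested dicts into DataFrame columns
df = pd.json_normalize({'a': {'b': {'c': 1}}}, sep='.')
print(df.columns.tolist())  # ['a.b.c']
```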
## HTML Parsing

### Pattern 9: Extract All Links with Context
```python
from bs4 import BeautifulSoup

def extract_links_with_context(html):
    """
    Extract links with surrounding text for context.
    Returns: [{'url': '...', 'text': '...', 'context': '...'}, ...]
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for a_tag in soup.find_all('a', href=True):
        link_info = {
            'url': a_tag['href'],
            'text': a_tag.get_text(strip=True),
            'context': ''
        }
        # Get parent paragraph for context
        parent = a_tag.find_parent(['p', 'div', 'li'])
        if parent:
            link_info['context'] = parent.get_text(strip=True)[:100]
        links.append(link_info)
    return links
```
**When to use:** Web scraping where link context matters.
**Extension:** Track which section/heading the link appears under, as in the sketch below.
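A hedged sketch of that extension: `find_previous()` walks backward through document order, so the nearest preceding `h1`/`h2`/`h3` approximates the section a link appears under (assuming headings precede their content).

```python
from bs4 import BeautifulSoup

def extract_links_with_section(html):
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for a_tag in soup.find_all('a', href=True):
        # Nearest heading before this link in document order
        heading = a_tag.find_previous(['h1', 'h2', 'h3'])
        links.append({
            'url': a_tag['href'],
            'section': heading.get_text(strip=True) if heading else None,
        })
    return links
```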
### Pattern 10: Extract Table Data
```python
from bs4 import BeautifulSoup

def extract_table(html, table_index=0):
    """
    Extract table as list of dictionaries.
    Returns: [{'Header1': 'value1', 'Header2': 'value2'}, ...]
    """
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table')
    if table_index >= len(tables):
        return []
    table = tables[table_index]
    headers = [th.get_text(strip=True) for th in table.find_all('th')]
    if not headers:
        # No header row, use first row's cells as headers
        first_row = table.find('tr')
        if first_row:
            headers = [td.get_text(strip=True) for td in first_row.find_all('td')]
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip header row
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cells:
            row_dict = dict(zip(headers, cells))
            rows.append(row_dict)
    return rows
```
**When to use:** Scraping tabular data from web pages.
**Note:** Handles tables with or without explicit `<th>` headers.
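A quick usage sketch with an inline table:

```python
html = """
<table>
  <tr><th>Name</th><th>Age</th></tr>
  <tr><td>Ada</td><td>36</td></tr>
</table>
"""
extract_table(html)
# [{'Name': 'Ada', 'Age': '36'}]
```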
### Pattern 11: Navigate by Section
```python
from bs4 import BeautifulSoup

def extract_by_section(html):
    """
    Extract content organized by headings.
    Returns: {'Section Name': ['paragraph1', 'paragraph2', ...], ...}
    """
    soup = BeautifulSoup(html, 'html.parser')
    sections = {}
    current_section = 'Introduction'
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            current_section = element.get_text(strip=True)
            sections[current_section] = []
        elif element.name == 'p':
            text = element.get_text(strip=True)
            if text:
                if current_section not in sections:
                    sections[current_section] = []
                sections[current_section].append(text)
    return sections
```
**When to use:** Need to maintain document structure during extraction.
**Common case:** Converting articles to structured data.
## AST Analysis

### Pattern 12: Find All Functions
```python
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

def find_all_functions(source_code):
    """
    Find all function definitions with their names and line numbers.
    Returns: [{'name': 'func_name', 'line': 10, 'code': '...'}, ...]
    """
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser(PY_LANGUAGE)
    tree = parser.parse(bytes(source_code, 'utf8'))
    functions = []
    def _traverse(node):
        if node.type == 'function_definition':
            name_node = node.child_by_field_name('name')
            if name_node:
                func_info = {
                    'name': name_node.text.decode('utf8'),
                    'line': node.start_point[0] + 1,
                    'code': node.text.decode('utf8')
                }
                functions.append(func_info)
        for child in node.children:
            _traverse(child)
    _traverse(tree.root_node)
    return functions
```
**When to use:** Static analysis, documentation generation, code indexing.
**Note:** Keep the original source as bytes for accurate slicing; see the sketch below.
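A small sketch of why that note matters: tree-sitter offsets are byte offsets, so slicing the decoded string misaligns on multibyte characters.

```python
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

source_bytes = bytes("def greet():\n    return 'héllo'\n", 'utf8')
tree = Parser(Language(tspython.language())).parse(source_bytes)
func = tree.root_node.children[0]  # the function_definition node

# 'é' is two bytes in UTF-8, so the same indices into the decoded
# string would drift; slice the original bytes, then decode.
print(source_bytes[func.start_byte:func.end_byte].decode('utf8'))
```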
### Pattern 13: Extract Function Signatures
```python
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

def extract_function_signatures(source_code):
    """
    Extract function signatures with parameters and return types.
    Returns: [{'name': 'func', 'params': ['a', 'b'], 'returns': 'int'}, ...]
    """
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser(PY_LANGUAGE)
    tree = parser.parse(bytes(source_code, 'utf8'))
    signatures = []
    def _extract_params(params_node):
        params = []
        for child in params_node.children:
            if child.type == 'identifier':
                params.append(child.text.decode('utf8'))
        return params
    def _traverse(node):
        if node.type == 'function_definition':
            sig = {}
            name_node = node.child_by_field_name('name')
            if name_node:
                sig['name'] = name_node.text.decode('utf8')
            params_node = node.child_by_field_name('parameters')
            if params_node:
                sig['params'] = _extract_params(params_node)
            return_node = node.child_by_field_name('return_type')
            if return_node:
                sig['returns'] = return_node.text.decode('utf8')
            signatures.append(sig)
        for child in node.children:
            _traverse(child)
    _traverse(tree.root_node)
    return signatures
```
**When to use:** Generating API documentation, type checking, code analysis.
**Extension:** Track decorators, docstrings, argument types; a decorator sketch follows.
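A hedged sketch of the decorator part of that extension. It assumes the tree-sitter-python grammar's `decorated_definition` wrapper node, which holds `decorator` children ahead of the definition itself:

```python
def extract_decorators(func_node):
    """Collect decorator text for a function_definition node, if any."""
    decorators = []
    parent = func_node.parent
    if parent is not None and parent.type == 'decorated_definition':
        for child in parent.children:
            if child.type == 'decorator':
                decorators.append(child.text.decode('utf8'))
    return decorators
```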
### Pattern 14: Find All Imports
```python
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

def find_all_imports(source_code):
    """
    Find all import statements.
    Returns: {
        'standard': ['os', 'sys'],
        'third_party': ['requests', 'numpy'],
        'local': ['utils', 'models']
    }
    """
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser(PY_LANGUAGE)
    tree = parser.parse(bytes(source_code, 'utf8'))
    imports = {'standard': [], 'third_party': [], 'local': []}
    def _traverse(node):
        if node.type in ['import_statement', 'import_from_statement']:
            import_text = node.text.decode('utf8')
            # Simple heuristic: relative imports are local; everything
            # else would need proper classification (see note below)
            if import_text.startswith('from .'):
                imports['local'].append(import_text)
            else:
                imports['third_party'].append(import_text)
        for child in node.children:
            _traverse(child)
    _traverse(tree.root_node)
    return imports
```
**When to use:** Dependency analysis, refactoring, understanding code structure.
**Note:** The classification logic is simplified; a real tool would check against the stdlib module list, as in the sketch below.
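A hedged sketch of that classification using `sys.stdlib_module_names` (Python 3.10+). The module name is pulled naively from the statement text, so treat this as illustrative:

```python
import sys

def classify_import(import_text):
    if import_text.startswith('from .'):
        return 'local'
    # Naive: take the first dotted name after 'import' or 'from'
    top_level = import_text.split()[1].split('.')[0]
    if top_level in sys.stdlib_module_names:
        return 'standard'
    return 'third_party'

classify_import('import os')             # 'standard'
classify_import('import requests')       # 'third_party'
classify_import('from .utils import x')  # 'local'
```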
### Pattern 15: Track Scope Context
```python
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

def analyze_with_scope(source_code):
    """
    Analyze code while tracking which function/class each node is in.
    Returns: [{'type': 'call', 'name': 'print', 'scope': 'function:main'}, ...]
    """
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser(PY_LANGUAGE)
    tree = parser.parse(bytes(source_code, 'utf8'))
    results = []
    def _traverse(node, scope_stack=()):
        # Track scope
        current_scope = scope_stack
        if node.type == 'function_definition':
            name_node = node.child_by_field_name('name')
            if name_node:
                func_name = name_node.text.decode('utf8')
                current_scope = scope_stack + (f'function:{func_name}',)
        elif node.type == 'class_definition':
            name_node = node.child_by_field_name('name')
            if name_node:
                class_name = name_node.text.decode('utf8')
                current_scope = scope_stack + (f'class:{class_name}',)
        # Collect interesting nodes
        if node.type == 'call':
            func_node = node.child_by_field_name('function')
            if func_node:
                results.append({
                    'type': 'call',
                    'name': func_node.text.decode('utf8'),
                    'scope': '.'.join(current_scope) if current_scope else 'module',
                    'line': node.start_point[0] + 1
                })
        # Recurse with updated scope
        for child in node.children:
            _traverse(child, current_scope)
    _traverse(tree.root_node)
    return results
```
**When to use:** Understanding where operations occur, refactoring analysis.
**Pattern:** Pass scope information down the recursion.
## Usage Notes

**Adapting these patterns:**
- Start with the closest match to your problem
- Modify the condition (what you're looking for)
- Adjust the return value (what you collect)
- Add error handling for your specific edge cases
**Performance considerations:**

- All generators (using `yield`) can be converted to lists with `list()`
- All recursive functions can blow the stack on deep structures
- Add depth limits if maximum depth is unknown
- Use visited sets for graphs/cyclic structures (see the sketch below)
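A minimal cycle-safe traversal sketch, assuming containers may reference each other; it tracks container `id()`s in a visited set:

```python
def walk_safely(obj, visited=None):
    """Yield leaf values, skipping containers already seen."""
    if visited is None:
        visited = set()
    if isinstance(obj, (dict, list)):
        if id(obj) in visited:
            return  # Already visited: break the cycle
        visited.add(id(obj))
        items = obj.values() if isinstance(obj, dict) else obj
        for item in items:
            yield from walk_safely(item, visited)
    else:
        yield obj
```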
**Common modifications:**

- Change `yield` to `results.append()` if you need a list
- Add `if condition:` before collection to filter results
- Add counters/statistics alongside collection
- Return early when the first match is found (replace `yield` with `return`), as in the sketch below
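For example, Pattern 6 rewritten to stop at the first match (note it cannot distinguish a missing key from a stored `None`):

```python
def find_first_value_for_key(data, target_key):
    if isinstance(data, dict):
        for key, value in data.items():
            if key == target_key:
                return value
            found = find_first_value_for_key(value, target_key)
            if found is not None:
                return found
    elif isinstance(data, list):
        for item in data:
            found = find_first_value_for_key(item, target_key)
            if found is not None:
                return found
    return None
```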
**Remember:** These are starting points, not finished products. Real code needs:
- Logging for debugging
- Type hints for clarity
- Docstrings with examples
- Tests for edge cases
- Error messages that help users