fennol.utils.unit_parser
"""Parse unit strings such as ``"EV*ANGSTROM^2"`` into numeric multipliers."""
from math import pi
import math
import re
import copy
from typing import Dict, Union, Optional

# Token patterns, compiled once so the tokenizer can match at an offset
# (``pattern.match(text, pos)``) instead of re-slicing the input string.
_UNIT_NAME_RE = re.compile(r'[A-Z0-9_-]+')
_BRACED_POWER_RE = re.compile(r'\{[^}]*\}')
_NUMERIC_POWER_RE = re.compile(r'[+-]?\d+(?:\.\d+)?')
_PARENTHESIZED_POWER_RE = re.compile(r'\([^\)]*\)')

_BINARY_OPERATORS = ('*', '/')
_ALL_OPERATORS = ('*', '/', '^')


def _tokenize_unit_string(unit_string: str) -> list:
    """
    Tokenize a unit string into components.

    Returns a list of string tokens where each token is either:
    - A unit name (upper-case letters, digits, '_' and '-')
    - An operator ('*', '/', '^')
    - A power value following '^' (braces/parentheses already stripped)
    - Parentheses ('(', ')')

    Raises:
        ValueError: On a character that cannot start a token, or when '^'
            is not followed by a braced, parenthesized or numeric power.
    """
    tokens = []
    length = len(unit_string)
    pos = 0

    while pos < length:
        current_char = unit_string[pos]

        if current_char.isspace():
            pos += 1
            continue

        # Grouping parentheses and the binary operators are one-char tokens.
        if current_char in '()*/':
            tokens.append(current_char)
            pos += 1
            continue

        if current_char == '^':
            tokens.append(current_char)
            pos += 1

            # Whitespace is allowed between '^' and the power value.
            while pos < length and unit_string[pos].isspace():
                pos += 1
            if pos >= length:
                raise ValueError("Power operator '^' not followed by power value")

            # Wrapped powers: {2}, {-3}, {2.5} or (2), (-3), (2.5).
            # Only the inner text is kept as the token.
            wrapped_match = (
                _BRACED_POWER_RE.match(unit_string, pos)
                or _PARENTHESIZED_POWER_RE.match(unit_string, pos)
            )
            if wrapped_match:
                tokens.append(wrapped_match.group(0)[1:-1].strip())
                pos = wrapped_match.end()
                continue

            # Bare numeric power: 2, -3, 2.5
            numeric_match = _NUMERIC_POWER_RE.match(unit_string, pos)
            if numeric_match:
                tokens.append(numeric_match.group(0))
                pos = numeric_match.end()
                continue

            raise ValueError(f"Invalid power specification starting at position {pos}")

        unit_match = _UNIT_NAME_RE.match(unit_string, pos)
        if unit_match:
            tokens.append(unit_match.group(0))
            pos = unit_match.end()
            continue

        raise ValueError(
            f"Invalid character '{current_char}' at position {pos} in unit string '{unit_string}'"
        )

    return tokens


def _validate_syntax(tokens: list) -> None:
    """
    Validate the syntax of a tokenized unit string.

    Checks that the token list is non-empty, does not start or end with an
    operator, has balanced parentheses, and contains no invalid adjacent
    token pairs.

    Raises:
        ValueError: Describing the first violation found.
    """
    if not tokens:
        raise ValueError("Empty unit string")

    if tokens[0] in _ALL_OPERATORS:
        raise ValueError(f"Unit string cannot start with operator '{tokens[0]}'")
    if tokens[-1] in _ALL_OPERATORS:
        raise ValueError(f"Unit string cannot end with operator '{tokens[-1]}'")

    # Balanced parentheses: the depth must never go negative and must end at 0.
    paren_count = 0
    for i, token in enumerate(tokens):
        if token == '(':
            paren_count += 1
        elif token == ')':
            paren_count -= 1
            if paren_count < 0:
                raise ValueError(f"Unmatched closing parenthesis at position {i}")
    if paren_count > 0:
        raise ValueError("Unmatched opening parenthesis")

    # Pairwise checks on adjacent tokens.
    for current, next_token in zip(tokens, tokens[1:]):
        if current in _BINARY_OPERATORS and next_token in _ALL_OPERATORS:
            raise ValueError(
                f"Invalid consecutive operators: '{current}' followed by '{next_token}'"
            )

        if current == '^':
            # Coarse check only; the exact numeric conversion (float()) is
            # done when the expression is evaluated.
            digits_only = next_token.replace('.', '').replace('-', '').replace('+', '')
            if not (next_token.startswith('{') or digits_only.isdigit()):
                raise ValueError(
                    f"Power operator '^' must be followed by a number, got '{next_token}'"
                )

        if current == '(' and next_token == ')':
            raise ValueError("Empty parentheses are not allowed")

        if current == '(' and next_token in _ALL_OPERATORS:
            raise ValueError(
                f"Opening parenthesis cannot be followed by operator '{next_token}'"
            )
        if current in _ALL_OPERATORS and next_token == ')':
            raise ValueError(
                f"Operator '{current}' cannot be followed by closing parenthesis"
            )


def _find_matching_paren(tokens: list, start_pos: int) -> int:
    """
    Find the position of the closing parenthesis matching the opening
    parenthesis at ``start_pos``.

    Raises:
        ValueError: If ``tokens[start_pos]`` is not '(' or no matching ')'
            exists.
    """
    if tokens[start_pos] != '(':
        raise ValueError("Expected opening parenthesis")

    depth = 0
    for pos in range(start_pos, len(tokens)):
        if tokens[pos] == '(':
            depth += 1
        elif tokens[pos] == ')':
            depth -= 1
            if depth == 0:
                return pos

    raise ValueError("Unmatched opening parenthesis")


def _parse_expression(tokens: list, unit_dict: Dict[str, float]) -> float:
    """
    Parse a list of tokens and return the multiplier.

    Parenthesized groups are evaluated recursively and their numeric result
    is placed directly into the flattened token list. Nothing is written
    into ``unit_dict``, so nested groups cannot overwrite each other's
    results and the caller's dictionary is left untouched.

    Args:
        tokens: List of tokens to parse
        unit_dict: Dictionary mapping unit names to their multipliers

    Returns:
        float: The calculated multiplier

    Raises:
        ValueError: On an empty token list, unbalanced parentheses, or any
            error raised while evaluating the flattened expression.
    """
    if not tokens:
        raise ValueError("Empty expression")

    flattened = []
    i = 0
    while i < len(tokens):
        token = tokens[i]

        if token == '(':
            close_pos = _find_matching_paren(tokens, i)
            # The evaluated group is carried as a non-string token.
            flattened.append(_parse_expression(tokens[i + 1:close_pos], unit_dict))
            i = close_pos + 1
        elif token == ')':
            # This should not happen if validation passed
            raise ValueError("Unexpected closing parenthesis")
        else:
            flattened.append(token)
            i += 1

    return _parse_linear_expression(flattened, unit_dict)


def _parse_linear_expression(tokens: list, unit_dict: Dict[str, float]) -> float:
    """
    Parse a linear expression (no parentheses) and return the multiplier.

    Each operand is raised to its power (default 1) and multiplied into the
    result; an operand preceded by '/' has its power negated. String tokens
    are looked up in ``unit_dict``; non-string tokens are treated as values
    that were already evaluated (parenthesized groups).

    Args:
        tokens: List of tokens to parse
        unit_dict: Dictionary mapping unit names to their multipliers

    Returns:
        float: The calculated multiplier

    Raises:
        ValueError: On a stray '^', an invalid power value or an unknown
            unit name.
    """
    multiplier = 1.0
    current_sign = 1.0  # 1.0 for multiplication, -1.0 for division
    count = len(tokens)

    i = 0
    while i < count:
        token = tokens[i]
        is_text = isinstance(token, str)

        if is_text and token in _BINARY_OPERATORS:
            current_sign = 1.0 if token == '*' else -1.0
            i += 1
            continue

        if is_text and token == '^':
            # This should not happen if validation passed
            raise ValueError("Unexpected '^' token")

        # Operand: optional '^ power' suffix is consumed together with it.
        power = current_sign
        following = tokens[i + 1] if i + 1 < count else None
        if isinstance(following, str) and following == '^':
            if i + 2 >= count:
                raise ValueError("Power operator '^' not followed by power value")

            power_spec = tokens[i + 2]
            try:
                power_value = float(power_spec)
            except (TypeError, ValueError):
                raise ValueError(f"Invalid power specification: '{power_spec}'")
            power = current_sign * power_value
            i += 2  # Skip the '^' and power tokens

        if is_text:
            if token not in unit_dict:
                raise ValueError(f"Unknown unit: {token}")
            base = unit_dict[token]
        else:
            base = token

        multiplier *= base ** power

        # A '/' only applies to the operand that directly follows it.
        current_sign = 1.0
        i += 1

    return multiplier


def parse_unit_string(unit_string: str, unit_dict: Dict[str, float]) -> float:
    """
    Parse a unit string and return the conversion multiplier.

    This is the main entry point for unit string parsing. It handles the
    complete parsing pipeline: tokenization, validation, and calculation.
    The unit string is upper-cased before parsing, so unit names are looked
    up in ``unit_dict`` in upper case. ``unit_dict`` is not modified.

    Args:
        unit_string: The unit string to parse (e.g., "EV*ANGSTROM^2")
        unit_dict: Dictionary mapping unit names to their multipliers

    Returns:
        float: The calculated multiplier

    Raises:
        ValueError: If the unit string is invalid or contains unknown units

    Example:
        >>> unit_dict = {"EV": 27.211, "ANGSTROM": 0.529}
        >>> round(parse_unit_string("EV*ANGSTROM^2", unit_dict), 6)
        7.614753
    """
    unit_string = unit_string.upper().strip()

    if not unit_string:
        raise ValueError("Empty unit string")

    try:
        tokens = _tokenize_unit_string(unit_string)
    except ValueError as e:
        raise ValueError(f"Syntax error in unit '{unit_string}': {str(e)}") from e

    _validate_syntax(tokens)

    # No defensive copy is needed: evaluation never writes to unit_dict.
    try:
        return _parse_expression(tokens, unit_dict)
    except ValueError as e:
        raise ValueError(f"Error parsing unit '{unit_string}': {str(e)}") from e
def
parse_unit_string(unit_string: str, unit_dict: Dict[str, float]) -> float:
def parse_unit_string(unit_string: str, unit_dict: Dict[str, float]) -> float:
    """
    Parse a unit string and return the conversion multiplier.

    This is the main entry point for unit string parsing. It handles the
    complete parsing pipeline: tokenization, validation, and calculation.
    The unit string is upper-cased before parsing, so unit names are looked
    up in ``unit_dict`` in upper case.

    Args:
        unit_string: The unit string to parse (e.g., "EV*ANGSTROM^2")
        unit_dict: Dictionary mapping unit names to their multipliers

    Returns:
        float: The calculated multiplier

    Raises:
        ValueError: If the unit string is invalid or contains unknown units

    Example:
        >>> unit_dict = {"EV": 27.211, "ANGSTROM": 0.529}
        >>> round(parse_unit_string("EV*ANGSTROM^2", unit_dict), 6)
        7.614753
    """
    unit_string = unit_string.upper().strip()

    if not unit_string:
        raise ValueError("Empty unit string")

    # Work on a copy so the caller's dictionary is never modified while
    # the expression is evaluated.
    unit_dict_copy = copy.deepcopy(unit_dict)

    # Tokenize the unit string; chain the original error for easier debugging.
    try:
        tokens = _tokenize_unit_string(unit_string)
    except ValueError as e:
        raise ValueError(f"Syntax error in unit '{unit_string}': {str(e)}") from e

    # Validate syntax (raises ValueError on the first violation)
    _validate_syntax(tokens)

    # Evaluate the validated tokens to obtain the multiplier
    try:
        return _parse_expression(tokens, unit_dict_copy)
    except ValueError as e:
        raise ValueError(f"Error parsing unit '{unit_string}': {str(e)}") from e
Parse a unit string and return the conversion multiplier.
This is the main entry point for unit string parsing. It handles the complete parsing pipeline: tokenization, validation, and calculation.
Args: unit_string: The unit string to parse (e.g., "EV*ANGSTROM^2") unit_dict: Dictionary mapping unit names to their multipliers
Returns: float: The calculated multiplier
Raises: ValueError: If the unit string is invalid or contains unknown units
Example:
>>> unit_dict = {"EV": 27.211, "ANGSTROM": 0.529}
>>> round(parse_unit_string("EV*ANGSTROM^2", unit_dict), 6)
7.614753