fennol.utils.unit_parser

View Source

  1from math import pi
  2import math
  3import re
  4import copy
  5import copy
  6from typing import Dict, Union, Optional
  7
  8
  9def _tokenize_unit_string(unit_string: str) -> list:
 10    """
 11    Tokenize a unit string into components.
 12    
 13    Returns a list of tokens where each token is either:
 14    - A unit name (string)
 15    - An operator ('*', '/', '^')
 16    - A power specification (string starting with number or '{' or '(')
 17    - Parentheses ('(', ')')
 18    """
 19    # Define simpler regex patterns
 20    unit_name_pattern = r'[A-Z0-9_-]+'
 21    braced_power_pattern = r'\{[^}]*\}'
 22    numeric_power_pattern = r'[+-]?\d+(?:\.\d+)?'
 23    parenthesized_power_pattern = r'\([^\)]*\)'
 24    
 25    tokens = []
 26    pos = 0
 27    
 28    while pos < len(unit_string):
 29        # Skip whitespace
 30        while pos < len(unit_string) and unit_string[pos].isspace():
 31            pos += 1
 32        
 33        if pos >= len(unit_string):
 34            break
 35        
 36        current_char = unit_string[pos]
 37        
 38        # Handle parentheses
 39        if current_char in '()':
 40            tokens.append(current_char)
 41            pos += 1
 42            continue
 43        
 44        # Handle operators
 45        if current_char in '*/':
 46            tokens.append(current_char)
 47            pos += 1
 48            continue
 49        
 50        # Handle power operator
 51        if current_char == '^':
 52            tokens.append(current_char)
 53            pos += 1
 54            
 55            # Skip whitespace after ^
 56            while pos < len(unit_string) and unit_string[pos].isspace():
 57                pos += 1
 58            
 59            if pos >= len(unit_string):
 60                raise ValueError("Power operator '^' not followed by power value")
 61            
 62            # Look for braced power: {2}, {-3}, {2.5}
 63            braced_match = re.match(braced_power_pattern, unit_string[pos:])
 64            if braced_match:
 65                tokens.append(braced_match.group(0)[1:-1].strip())  # Remove braces
 66                pos += braced_match.end()
 67                continue
 68
 69            # Look for parenthesized power: (2), (-3), (2.5)
 70            parenthesized_match = re.match(parenthesized_power_pattern, unit_string[pos:])
 71            if parenthesized_match:
 72                tokens.append(parenthesized_match.group(0)[1:-1].strip())
 73                pos += parenthesized_match.end()
 74                continue
 75            
 76            # Look for numeric power: 2, -3, 2.5
 77            numeric_match = re.match(numeric_power_pattern, unit_string[pos:])
 78            if numeric_match:
 79                tokens.append(numeric_match.group(0).strip())
 80                pos += numeric_match.end()
 81                continue
 82            
 83            # No valid power found
 84            raise ValueError(f"Invalid power specification starting at position {pos}")
 85        
 86        # Handle unit names
 87        unit_match = re.match(unit_name_pattern, unit_string[pos:])
 88        if unit_match:
 89            tokens.append(unit_match.group(0))
 90            pos += unit_match.end()
 91            continue
 92        
 93        # If we get here, we have an invalid character
 94        raise ValueError(f"Invalid character '{current_char}' at position {pos} in unit string '{unit_string}'")
 95    
 96    return tokens
 97
 98
 99def _validate_syntax(tokens: list) -> None:
100    """
101    Validate the syntax of tokenized unit string.
102    """
103    if not tokens:
104        raise ValueError("Empty unit string")
105    
106    # Check for invalid starting/ending tokens
107    if tokens[0] in ['*', '/', '^']:
108        raise ValueError(f"Unit string cannot start with operator '{tokens[0]}'")
109    if tokens[-1] in ['*', '/', '^']:
110        raise ValueError(f"Unit string cannot end with operator '{tokens[-1]}'")
111    
112    # Check for balanced parentheses
113    paren_count = 0
114    for i, token in enumerate(tokens):
115        if token == '(':
116            paren_count += 1
117        elif token == ')':
118            paren_count -= 1
119            if paren_count < 0:
120                raise ValueError(f"Unmatched closing parenthesis at position {i}")
121    
122    if paren_count > 0:
123        raise ValueError("Unmatched opening parenthesis")
124    
125    # Check for invalid consecutive operators
126    for i in range(len(tokens) - 1):
127        current, next_token = tokens[i], tokens[i + 1]
128        
129        # Check for consecutive operators (except ^ followed by power)
130        if current in ['*', '/'] and next_token in ['*', '/', '^']:
131            raise ValueError(f"Invalid consecutive operators: '{current}' followed by '{next_token}'")
132        
133        # Check for ^ not followed by power
134        if current == '^' and not (next_token.startswith('{') or next_token.replace('.', '').replace('-', '').replace('+', '').isdigit()):
135            raise ValueError(f"Power operator '^' must be followed by a number, got '{next_token}'")
136        
137        # Check for empty parentheses
138        if current == '(' and next_token == ')':
139            raise ValueError("Empty parentheses are not allowed")
140        
141        # Check for invalid parentheses placement
142        if current == '(' and next_token in ['*', '/', '^']:
143            raise ValueError(f"Opening parenthesis cannot be followed by operator '{next_token}'")
144        if current in ['*', '/', '^'] and next_token == ')':
145            raise ValueError(f"Operator '{current}' cannot be followed by closing parenthesis")
146
147
148def _find_matching_paren(tokens: list, start_pos: int) -> int:
149    """
150    Find the position of the matching closing parenthesis for the opening parenthesis at start_pos.
151    """
152    if tokens[start_pos] != '(':
153        raise ValueError("Expected opening parenthesis")
154    
155    paren_count = 1
156    pos = start_pos + 1
157    
158    while pos < len(tokens) and paren_count > 0:
159        if tokens[pos] == '(':
160            paren_count += 1
161        elif tokens[pos] == ')':
162            paren_count -= 1
163        pos += 1
164    
165    if paren_count > 0:
166        raise ValueError("Unmatched opening parenthesis")
167    
168    return pos - 1  # Return position of closing parenthesis
169
170
171def _parse_expression(tokens: list, unit_dict: Dict[str, float]) -> float:
172    """
173    Parse a list of tokens and return the multiplier.
174    This is a recursive descent parser that handles parentheses.
175    
176    Args:
177        tokens: List of tokens to parse
178        unit_dict: Dictionary mapping unit names to their multipliers
179    
180    Returns:
181        float: The calculated multiplier
182    """
183    if not tokens:
184        raise ValueError("Empty expression")
185    
186    # Handle parentheses by recursively parsing sub-expressions
187    processed_tokens = []
188    i = 0
189    
190    while i < len(tokens):
191        token = tokens[i]
192        
193        if token == '(':
194            # Find matching closing parenthesis
195            close_pos = _find_matching_paren(tokens, i)
196            
197            # Recursively parse the sub-expression inside parentheses
198            sub_tokens = tokens[i+1:close_pos]
199            sub_result = _parse_expression(sub_tokens, unit_dict)
200            
201            # Replace the parenthesized expression with its result
202            # Create a synthetic unit name to represent the result
203            synthetic_unit = f"__RESULT_{len(processed_tokens)}__"
204            unit_dict[synthetic_unit] = sub_result
205            processed_tokens.append(synthetic_unit)
206            
207            i = close_pos + 1
208            continue
209        
210        elif token == ')':
211            # This should not happen if validation passed
212            raise ValueError("Unexpected closing parenthesis")
213        
214        else:
215            processed_tokens.append(token)
216            i += 1
217    
218    # Now parse the processed tokens without parentheses
219    return _parse_linear_expression(processed_tokens, unit_dict)
220
221
222def _parse_linear_expression(tokens: list, unit_dict: Dict[str, float]) -> float:
223    """
224    Parse a linear expression (no parentheses) and return the multiplier.
225    
226    Args:
227        tokens: List of tokens to parse
228        unit_dict: Dictionary mapping unit names to their multipliers
229    
230    Returns:
231        float: The calculated multiplier
232    """
233    multiplier = 1.0
234    current_sign = 1.0  # 1.0 for multiplication, -1.0 for division
235    
236    i = 0
237    while i < len(tokens):
238        token = tokens[i]
239        
240        if token in ['*', '/']:
241            # Set sign for next unit
242            current_sign = 1.0 if token == '*' else -1.0
243            i += 1
244            continue
245        
246        elif token == '^':
247            # This should not happen if validation passed
248            raise ValueError("Unexpected '^' token")
249        
250        else:
251            # This should be a unit name
252            unit_name = token
253            power = current_sign  # Default power is 1.0 or -1.0 based on sign
254            
255            # Check if next token is a power specification
256            if i + 1 < len(tokens) and tokens[i + 1] == '^':
257                if i + 2 >= len(tokens):
258                    raise ValueError("Power operator '^' not followed by power value")
259                
260                power_spec = tokens[i + 2]
261                try:
262                    power_value = float(power_spec)
263                except ValueError:
264                    raise ValueError(f"Invalid power specification: '{power_spec}'")
265                power = current_sign * power_value
266                i += 2  # Skip the '^' and power tokens
267            
268            # Look up unit multiplier
269            if unit_name not in unit_dict:
270                raise ValueError(f"Unknown unit: {unit_name}")
271            
272            unit_multiplier = unit_dict[unit_name]
273            multiplier *= unit_multiplier ** power
274            
275            # Reset sign for next unit
276            current_sign = 1.0
277        
278        i += 1
279    
280    return multiplier
281
282
def parse_unit_string(unit_string: str, unit_dict: Dict[str, float]) -> float:
    """
    Parse a unit string and return the conversion multiplier.

    This is the main entry point for unit string parsing. It handles the complete
    parsing pipeline: tokenization, validation, and calculation.

    The unit string is upper-cased and stripped before parsing, so unit
    names are matched against unit_dict in upper case. The parser works on
    a deep copy of unit_dict; the dictionary passed in is not modified.

    Args:
        unit_string: The unit string to parse (e.g., "EV*ANGSTROM^2")
        unit_dict: Dictionary mapping unit names to their multipliers

    Returns:
        float: The calculated multiplier

    Raises:
        ValueError: If the unit string is invalid or contains unknown units

    Example:
        >>> unit_dict = {"EV": 27.211, "ANGSTROM": 0.529}
        >>> round(parse_unit_string("EV*ANGSTROM^2", unit_dict), 6)
        7.614753
    """
    unit_string = unit_string.upper().strip()

    if not unit_string:
        raise ValueError("Empty unit string")

    # Create a deep copy of the unit dictionary to avoid modification
    unit_dict_copy = copy.deepcopy(unit_dict)

    # Tokenize the unit string; keep the original error as the cause.
    try:
        tokens = _tokenize_unit_string(unit_string)
    except ValueError as e:
        raise ValueError(f"Syntax error in unit '{unit_string}': {str(e)}") from e

    # Validate syntax (its ValueError propagates unchanged)
    _validate_syntax(tokens)

    # Parse tokens to calculate multiplier using recursive descent parser
    try:
        return _parse_expression(tokens, unit_dict_copy)
    except ValueError as e:
        raise ValueError(f"Error parsing unit '{unit_string}': {str(e)}") from e

def parse_unit_string(unit_string: str, unit_dict: Dict[str, float]) -> float: View Source

def parse_unit_string(unit_string: str, unit_dict: Dict[str, float]) -> float:
    """
    Parse a unit string and return the conversion multiplier.

    This is the main entry point for unit string parsing. It handles the complete
    parsing pipeline: tokenization, validation, and calculation.

    The unit string is upper-cased and stripped before parsing, so unit
    names are matched against unit_dict in upper case. The parser works on
    a deep copy of unit_dict; the dictionary passed in is not modified.

    Args:
        unit_string: The unit string to parse (e.g., "EV*ANGSTROM^2")
        unit_dict: Dictionary mapping unit names to their multipliers

    Returns:
        float: The calculated multiplier

    Raises:
        ValueError: If the unit string is invalid or contains unknown units

    Example:
        >>> unit_dict = {"EV": 27.211, "ANGSTROM": 0.529}
        >>> round(parse_unit_string("EV*ANGSTROM^2", unit_dict), 6)
        7.614753
    """
    unit_string = unit_string.upper().strip()

    if not unit_string:
        raise ValueError("Empty unit string")

    # Create a deep copy of the unit dictionary to avoid modification
    unit_dict_copy = copy.deepcopy(unit_dict)

    # Tokenize the unit string; keep the original error as the cause.
    try:
        tokens = _tokenize_unit_string(unit_string)
    except ValueError as e:
        raise ValueError(f"Syntax error in unit '{unit_string}': {str(e)}") from e

    # Validate syntax (its ValueError propagates unchanged)
    _validate_syntax(tokens)

    # Parse tokens to calculate multiplier using recursive descent parser
    try:
        return _parse_expression(tokens, unit_dict_copy)
    except ValueError as e:
        raise ValueError(f"Error parsing unit '{unit_string}': {str(e)}") from e

Parse a unit string and return the conversion multiplier.

This is the main entry point for unit string parsing. It handles the complete parsing pipeline: tokenization, validation, and calculation.

Args:

- unit_string: The unit string to parse (e.g., "EV*ANGSTROM^2")
- unit_dict: Dictionary mapping unit names to their multipliers

Returns: float: The calculated multiplier

Raises: ValueError: If the unit string is invalid or contains unknown units

Example:

>>> unit_dict = {"EV": 27.211, "ANGSTROM": 0.529}
>>> round(parse_unit_string("EV*ANGSTROM^2", unit_dict), 6)
7.614753