Package logilab :: Package common :: Module textutils
[frames] | no frames]

Source Code for Module logilab.common.textutils

  1  # copyright 2003-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-common. 
  5  # 
  6  # logilab-common is free software: you can redistribute it and/or modify it under 
  7  # the terms of the GNU Lesser General Public License as published by the Free 
  8  # Software Foundation, either version 2.1 of the License, or (at your option) any 
  9  # later version. 
 10  # 
 11  # logilab-common is distributed in the hope that it will be useful, but WITHOUT 
 12  # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
 13  # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more 
 14  # details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-common.  If not, see <http://www.gnu.org/licenses/>. 
 18  """Some text manipulation utility functions. 
 19   
 20   
 21  :group text formatting: normalize_text, normalize_paragraph, pretty_match,\ 
 22  unquote, colorize_ansi 
 23  :group text manipulation: searchall, splitstrip 
 24  :sort: text formatting, text manipulation 
 25   
 26  :type ANSI_STYLES: dict(str) 
 27  :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code 
 28   
 29  :type ANSI_COLORS: dict(str) 
 30  :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code 
 31   
 32  :type ANSI_PREFIX: str 
 33  :var ANSI_PREFIX: 
 34    ANSI terminal code notifying the start of an ANSI escape sequence 
 35   
 36  :type ANSI_END: str 
 37  :var ANSI_END: 
 38    ANSI terminal code notifying the end of an ANSI escape sequence 
 39   
 40  :type ANSI_RESET: str 
 41  :var ANSI_RESET: 
 42    ANSI terminal code resetting format defined by a previous ANSI escape sequence 
 43  """ 
 44  __docformat__ = "restructuredtext en" 
 45   
 46  import sys 
 47  import re 
 48  import os.path as osp 
 49  from warnings import warn 
 50  from unicodedata import normalize as _uninormalize 
 51  try: 
 52      from os import linesep 
 53  except ImportError: 
 54      linesep = '\n' # gae 
 55   
 56  from logilab.common.deprecation import deprecated 
 57   
 58  MANUAL_UNICODE_MAP = { 
 59      u'\xa1': u'!',    # INVERTED EXCLAMATION MARK 
 60      u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE 
 61      u'\u2044': u'/',  # FRACTION SLASH 
 62      u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE 
 63      u'\xa9': u'(c)',  # COPYRIGHT SIGN 
 64      u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 
 65      u'\xe6': u'ae',   # LATIN SMALL LETTER AE 
 66      u'\xae': u'(r)',  # REGISTERED SIGN 
 67      u'\u0153': u'oe', # LATIN SMALL LIGATURE OE 
 68      u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE 
 69      u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE 
 70      u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE 
 71      u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 
 72      u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S 
 73      } 
 74   
75 -def unormalize(ustring, ignorenonascii=None, substitute=None):
76 """replace diacritical characters with their corresponding ascii characters 77 78 Convert the unicode string to its long normalized form (unicode character 79 will be transform into several characters) and keep the first one only. 80 The normal form KD (NFKD) will apply the compatibility decomposition, i.e. 81 replace all compatibility characters with their equivalents. 82 83 :type substitute: str 84 :param substitute: replacement character to use if decomposition fails 85 86 :see: Another project about ASCII transliterations of Unicode text 87 http://pypi.python.org/pypi/Unidecode 88 """ 89 # backward compatibility, ignorenonascii was a boolean 90 if ignorenonascii is not None: 91 warn("ignorenonascii is deprecated, use substitute named parameter instead", 92 DeprecationWarning, stacklevel=2) 93 if ignorenonascii: 94 substitute = '' 95 res = [] 96 for letter in ustring[:]: 97 try: 98 replacement = MANUAL_UNICODE_MAP[letter] 99 except KeyError: 100 replacement = _uninormalize('NFKD', letter)[0] 101 if ord(replacement) >= 2 ** 7: 102 if substitute is None: 103 raise ValueError("can't deal with non-ascii based characters") 104 replacement = substitute 105 res.append(replacement) 106 return u''.join(res)
107
def unquote(string):
    """Remove optional quotes (simple or double) from the string.

    A leading and a trailing quote are removed independently of each other:
    they don't have to be of the same kind, and a string quoted on one side
    only is still stripped on that side.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # the string is empty at this point when the input was a lone quote
    # character, so it must be checked again before reading its last character
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
# Two consecutive line breaks (i.e. a blank line); normalize_text splits its
# input into paragraphs on this pattern.
_BLANKLINES_RGX = re.compile('\r?\n\r?\n')
# Any run of whitespace. Written as a raw string: '\s' is not a valid string
# escape and a non-raw literal triggers a warning on recent Python versions.
_NORM_SPACES_RGX = re.compile(r'\s+')
def normalize_text(text, line_len=80, indent='', rest=False):
    """Wrap a text made of several paragraphs to a maximum line size.

    The text is split into paragraphs on blank lines; each paragraph is
    wrapped separately and the paragraphs are joined back with a line holding
    only the indentation string. The indentation string may be used to insert
    a comment (#) or a quoting (>) mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, paragraphs are wrapped with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    wrap_paragraph = normalize_rest_paragraph if rest else normalize_paragraph
    # paragraphs are separated by a line containing the indentation only
    separator = '%s%s%s' % (linesep, indent, linesep)
    wrapped = [wrap_paragraph(paragraph, line_len, indent)
               for paragraph in _BLANKLINES_RGX.split(text)]
    return separator.join(wrapped)
158 159
def normalize_paragraph(text, line_len=80, indent=''):
    """Wrap a single paragraph to a maximum line size.

    Every run of whitespace (line jumps included) is first collapsed to a
    single space, then the text is cut into lines with `splittext`. The
    indentation string is prepended to each line and counts in the line
    length; it may be used to insert a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    # room left for the text itself once the indentation is accounted for
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        head, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + head)
    return linesep.join(wrapped)
188
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """Wrap a ReST paragraph to a maximum line size, keeping its line jumps.

    Unlike `normalize_paragraph`, the input lines are processed one by one:
    a line is only cut (with `splittext`) when it is longer than the
    available width, and input lines are never merged together. The
    indentation string is prepended to each output line; it may be used to
    insert a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the input text normalized to fit on lines with a maximized size
      inferior to `line_len`, and optionally prefixed by an
      indentation string
    """
    # room left for the text itself once the indentation is accounted for
    width = line_len - len(indent)
    wrapped = []
    for raw_line in text.splitlines():
        current = _NORM_SPACES_RGX.sub(' ', raw_line.strip())
        while len(current) > width:
            # too long line, need split
            head, leftover = splittext(current, width)
            wrapped.append(indent + head)
            # what could not fit is processed again as a line of its own
            current = leftover + ' ' if leftover else ''
        if current:
            wrapped.append(indent + current.strip())
    return linesep.join(wrapped)
228 229
def splittext(text, line_len):
    """Split the given text on a space according to the given max line size.

    The text is cut at the last space found at or before `line_len`. When
    there is no such space (the first word is longer than `line_len`), it is
    cut at the first space found after `line_len`, or not at all if there is
    none. A space at index 0 is only used as a last resort.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      maximum size of the first line; a negative value is handled like 0

    :rtype: tuple
    :return: a 2-uple:

      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    if len(text) <= line_len:
        return text, ''
    # A negative width would otherwise be used as an index counted from the
    # end of the text, producing a "rest" as long as the input (and an endless
    # loop in callers that iterate until the rest is empty).
    line_len = max(line_len, 0)
    # last space within [1, line_len]
    pos = text.rfind(' ', 1, line_len + 1)
    if pos == -1:
        # no room to cut before the limit: look for the first space after it
        pos = text.find(' ', line_len)
        if pos == -1:
            pos = len(text)
    return text[:pos], text[pos + 1:].strip()
247 248
def splitstrip(string, sep=','):
    """Return a list of stripped strings by splitting the string given as
    argument on `sep` (',' by default). Empty strings are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list of str or unicode
    :return: the non-empty fields of the line, stripped of surrounding blanks
    """
    # strip each field once, then drop the ones which end up empty
    stripped = (word.strip() for word in string.split(sep))
    return [word for word in stripped if word]
# Former name of `splitstrip`, kept for backward compatibility. The alias is
# `splitstrip` wrapped by `deprecated` with a message pointing to the new name
# (presumably reported as a deprecation warning when the alias is called --
# see logilab.common.deprecation to confirm).
get_csv = deprecated('get_csv is deprecated, use splitstrip')(splitstrip)
def split_url_or_path(url_or_path):
    """Split off the latest component of a string containing either an url
    of the form <scheme>://<path> or a local file system path.

    Trailing separators are ignored.

    :type url_or_path: str or unicode
    :param url_or_path: the url or path to split

    :rtype: list or tuple
    :return:
      a (head, last component) pair: a list as returned by `rsplit` for an
      url, a tuple as returned by `os.path.split` for a local path
    """
    if '://' not in url_or_path:
        local_path = url_or_path.rstrip(osp.sep)
        return osp.split(local_path)
    trimmed_url = url_or_path.rstrip('/')
    return trimmed_url.rsplit('/', 1)
280 281
def text_to_dict(text):
    """Parse multilines text containing simple 'key=value' lines and return a
    dict of {'key': 'value'}. When the same key is encountered multiple times,
    value is turned into a list containing all values.

    Keys and values are stripped; only the first '=' of a line separates the
    key from the value. Blank lines and lines starting with '#' are ignored.

    >>> d = text_to_dict('''multiple=1
    ... multiple= 2
    ... single =3
    ... ''')
    >>> d['single']
    '3'
    >>> d['multiple']
    ['1', '2']

    :type text: str or unicode
    :param text: the text to parse (an empty or None value gives an empty dict)

    :raise ValueError: if a significant line doesn't contain any '='

    :rtype: dict
    :return: the parsed values, indexed by key
    """
    res = {}
    if not text:
        return res
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, equal, value = line.partition('=')
        if not equal:
            # same exception type as the unpacking error raised previously,
            # but with a message telling which line is wrong
            raise ValueError("expected a 'key=value' line, got %r" % line)
        key, value = key.strip(), value.strip()
        if key not in res:
            res[key] = value
        elif isinstance(res[key], list):
            res[key].append(value)
        else:
            # second occurrence of the key: switch to a list of values
            res[key] = [res[key], value]
    return res
# Blanks and commas are allowed anywhere in a unit string and ignored.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# An optionally negative number (with or without a fractional part),
# optionally followed by a unit made of letters.
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE, __UNITS_URE))
# A whole unit string: any number of (value, unit) pairs, optionally followed
# by a last value without unit.
_VALIDATION_RE = re.compile(r'^((%s)(%s))*(%s)?$' % (__VALUE_URE, __UNITS_URE,
                                                    __VALUE_URE))

# Multipliers giving a number of bytes.
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# Multipliers giving a number of seconds.
TIME_UNITS = {
    "ms": 0.001,
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 *24,
}


def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90.0).

    Every "<value><unit>" pair found in the string is converted with `inter`
    and multiplied by the value of its unit (looked up in lower case); the
    results are summed and the sum is converted with `final`.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units: a dict mapping a unit string repr to its value

    :type inter: type
    :param inter:
      used to parse every intermediate value (the result must support
      multiplication and addition), default to `final`

    :type final: type
    :param final: used to convert the sum of every value, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit into the string

    :raise ValueError: if the string is empty or isn't a valid unit string
    :raise KeyError: if a unit found in the string isn't defined in `units`

    :return: the total value, as returned by `final`
    """
    if inter is None:
        inter = final
    fstring = blank_reg.sub('', string)
    # NOTE: the overall syntax is always checked against the module-level
    # pattern, whatever `value_reg` is. That pattern also accepts a "0x"
    # prefix, which only converts if `inter` is able to parse it.
    if not (fstring and _VALIDATION_RE.match(fstring)):
        raise ValueError("Invalid unit string: %r." % string)
    values = []
    for match in value_reg.finditer(fstring):
        dic = match.groupdict()
        lit, unit = dic["value"], dic.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, units.keys()))
        values.append(value)
    return final(sum(values))
# Any line break; pretty_match replaces every match with `linesep` before
# locating the line holding the match.
_LINE_RGX = re.compile('\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """Return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    start, end = match.start(), match.end()
    string = _LINE_RGX.sub(linesep, string)
    eol_len = len(linesep)
    result = []
    # locate the beginning of the line holding the start of the match
    line_start = string.rfind(linesep, 0, start)
    if line_start == -1:
        line_start = 0
    else:
        # everything before that line is kept untouched
        result.append(string[:line_start])
        line_start += eol_len
    underline = ' ' * (start - line_start) + underline_char * (end - start)
    # locate the end of the line holding the end of the match
    line_end = string.find(linesep, end)
    if line_end == -1:
        result.extend([string[line_start:], underline])
    else:
        result.extend([string[line_start:line_end], underline,
                       string[line_end + eol_len:]])
    return linesep.join(result).rstrip()
# Ansi colorization ###########################################################

# An escape sequence is ANSI_PREFIX, then codes joined with ';', then ANSI_END.
ANSI_PREFIX = '\033['
ANSI_END = 'm'
# Complete sequence made of the single code "0" (reset).
ANSI_RESET = '\033[0m'
# Style identifier -> ANSI code.
ANSI_STYLES = {
    'reset': "0",
    'bold': "1",
    'italic': "3",
    'underline': "4",
    'blink': "5",
    'inverse': "7",
    'strike': "9",
}
# Color identifier -> ANSI code.
ANSI_COLORS = {
    'reset': "0",
    'black': "30",
    'red': "31",
    'green': "32",
    'yellow': "33",
    'blue': "34",
    'magenta': "35",
    'cyan': "36",
    'white': "37",
}
def _get_ansi_code(color=None, style=None):
    """Return the ansi escape code corresponding to color and style.

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor a
      style is given
    """
    if style:
        codes = [ANSI_STYLES[effect] for effect in splitstrip(style)]
    else:
        codes = []
    if color:
        if color.isdigit():
            # color given by its number: "38;5;<n>" sequence
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
491
def colorize_ansi(msg, color=None, style=None):
    """Colorize message by wrapping it with ansi escape codes.

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the message between the escape code and `ANSI_RESET`, or the message
      itself when there is nothing to apply
    """
    # When neither a color nor a style is given, the text is left as is
    if color is not None or style is not None:
        escape_code = _get_ansi_code(color, style)
        # An empty code (e.g. empty color and style strings) means there is
        # nothing to wrap the message with
        if escape_code:
            return '%s%s%s' % (escape_code, msg, ANSI_RESET)
    return msg
# Default colors used by diff_colorize_ansi for each kind of diff line.
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}


def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    """Write the lines of a diff to `out`, colorized according to their kind.

    Lines starting with '--- ' or '+++ ' use the 'separator' color, other
    lines starting with '-' the 'remove' color and other lines starting with
    '+' the 'add' color. Any other line (empty lines included) is written
    unchanged. Nothing is added between the lines.

    :type lines: iterable of str or unicode
    :param lines: the lines of the diff

    :type out: file-like object
    :param out: where to write the result, default to the standard output

    :type style: dict
    :param style:
      the color to use (see `colorize_ansi`) for each of the 'separator',
      'remove' and 'add' keys, default to `DIFF_STYLE`
    """
    for line in lines:
        # slices are used instead of indexes so that an empty line falls
        # through every test instead of raising an IndexError
        if line[:4] in ('--- ', '+++ '):
            line = colorize_ansi(line, style['separator'])
        elif line[:1] == '-':
            line = colorize_ansi(line, style['remove'])
        elif line[:1] == '+':
            line = colorize_ansi(line, style['add'])
        out.write(line)
537