# -*- coding: utf-8 -*-
from common import escape, is_whitespace, unescape
import re
import unittest
class AccountedString(object):
    """Retain full accountability of the edits made to a string.

    The pre-edit text is never modified.  Each change is recorded as an
    action (a Deletion or an Insertion) whose index refers to a position
    in the pre-edit text; the post-edit text is rebuilt on demand by
    replaying the sorted actions (see __unicode__).
    """

    def __init__(self, pre, config, encoding='ascii'):
        """Initialise with ``pre`` as the pre-edit version.

        pre -- the pre-edit text; if it is not already a unicode string
            it is decoded using ``encoding``
        config -- stored unchanged as ``self.config``; not read by this
            class itself
        encoding -- codec used when ``pre`` has to be decoded

        Construction calls _build() and then normalises whitespace, so a
        new instance may already carry registered actions.
        """
        super(AccountedString, self).__init__()
        self._pre = pre if isinstance(pre, unicode) else unicode(pre, encoding)
        self.config = config
        self.actions = list()
        self._build()
        self._normalise_whitespace()

    def __unicode__(self):
        """Return a unicode version of the string with the edits applied."""
        return unicode(_PostString(self))

    def _build(self):
        """Hook run by __init__ before whitespace normalisation.

        Does nothing here; presumably subclasses override it to register
        their own actions -- confirm against the subclasses.
        """
        pass

    @staticmethod
    def _leading_whitespace(start, statuses):
        """Return True if every status before ``start`` is 'd' (deleted)."""
        return all(status == 'd' for status in statuses[:start])

    def _make_statuses(self):
        """Return one status code per character of the pre-edit text.

        Codes: 'w' whitespace, 'r' retained (anything else), 'd' covered
        by a Deletion, 'i' overlaid by a non-whitespace inserted character.
        Actions are applied in the order they appear in ``self.actions``,
        so a later action overwrites the marks of an earlier one.
        """
        statuses = ['w' if is_whitespace(char) else 'r' for char in self._pre]
        limit = len(statuses)
        for action in self.actions:
            kind = type(action)
            if kind is Deletion:
                marks = ['d'] * len(action)
            elif kind is Insertion:
                marks = ['w' if is_whitespace(char) else 'i'
                         for char in unicode(action)]
            else:
                # other action types leave the statuses untouched
                continue
            for offset, mark in enumerate(marks):
                position = action.index + offset
                if position >= limit:
                    # the action runs past the end of the pre-edit text;
                    # there is no character left to mark, so stop instead
                    # of raising IndexError
                    break
                statuses[position] = mark
        return statuses

    def _normalise_whitespace(self, call_again=True):
        """Register the actions that normalise whitespace.

        For every run of whitespace in the current statuses:
          * a run with only deleted characters before it, or only deleted
            characters after it, is deleted entirely;
          * a longer run is cut down to its final character;
          * a single whitespace character other than ' ' is deleted and a
            ' ' is inserted in its place.
        The actions are then sorted.  With ``call_again`` true, one more
        pass is made, which lets the character left over from a cut-down
        run be turned into a plain space.
        """
        statuses = self._make_statuses()
        for start, end in self._whitespace_ranges(statuses):
            at_edge = (self._leading_whitespace(start, statuses) or
                       self._trailing_whitespace(end, statuses))
            if at_edge:
                self.register(Deletion(start, length=end - start + 1))
            elif start != end:
                # keep only the last character of the run
                self.register(Deletion(start, length=end - start))
            elif self._pre[start] != ' ':
                self.register(Deletion(start, length=1))
                self.register(Insertion(start, ' '))
        self.actions = sorted(self.actions)
        if call_again:
            self._normalise_whitespace(False)

    @staticmethod
    def _trailing_whitespace(end, statuses):
        """Return True if every status after ``end`` is 'd' (deleted)."""
        return all(status == 'd' for status in statuses[end + 1:])

    @staticmethod
    def _whitespace_ranges(statuses):
        """Yield inclusive (start, end) index pairs for each run of 'w'."""
        start = None
        for i, status in enumerate(statuses):
            if status == 'w':
                if start is None:
                    start = i
            elif start is not None:
                yield start, i - 1
                start = None
        if start is not None:
            # the final run reaches the end of the string
            yield start, len(statuses) - 1

    def register(self, action):
        """Register an action to apply to the string."""
        assert isinstance(action, Action)
        self.actions.append(action)

    @staticmethod
    def revert(post_string, actions):
        """Return ``post_string`` with every action reverted in sorted order."""
        pre_string = post_string
        for action in sorted(actions):
            pre_string = action.revert(pre_string)
        return pre_string
class _PostString(object):
""" creates a post-edit version of an accounted string """
def __init__(self, account):
super(_PostString, self).__init__()
# store the pre-edit version so actions may refer to it
self._pre = account._pre
if len(account.actions) == 0:
# if there are no actions then just use the pre-edit version
self._string = account._pre
else:
# _i tracks the location in the pre-edit version
self._i = 0
# _string holds the constructed post-edit version
self._string = u''
action_queue = sorted(account.actions)
nextAction = action_queue.pop(0)
while self._i < len(self._pre):
if self._i == nextAction.index:
nextAction.execute(self)
if len(action_queue) == 0:
# done with edits - just copy the
# rest of the pre-edit version
self._string += self._pre[self._i:]
break
else:
nextAction = action_queue.pop(0)
else:
# no action to take here - copy the current
# character, and move _i on to the next
self._string += self._pre[self._i]
self._i += 1
def __unicode__(self):
return self._string.replace('\n', ' ')
class Action(object):
    """An abstract class for deletions and insertions.

    An action has an ``index`` and a ``content`` string.  Concrete
    subclasses provide a one-character ``SYMBOL`` plus ``execute`` and
    ``revert``.  Actions serialise to the form ``@<index><symbol>"<content>"``.
    """

    FORMAT = u'@{index}{symbol}"{content}"'
    # the content group may be empty, so that an action with no content
    # written by tostring() can be read back by fromstring()
    PATTERN = re.compile(r'@(\d+)([\+\-])"(.*)"$', re.UNICODE)

    def __init__(self, index, content=u''):
        """Store the position (``index``) and text (``content``) of the action."""
        super(Action, self).__init__()
        self.index = index
        self.content = content

    def __cmp__(self, other):
        """Order actions by index; at the same index insertions come first.

        Two actions of the same kind at the same index compare as equal
        (0), which keeps the ordering consistent in both directions and
        lets a stable sort preserve their registration order.
        """
        if self.index != other.index:
            return -1 if self.index < other.index else 1
        own_rank = 0 if type(self) is Insertion else 1
        other_rank = 0 if type(other) is Insertion else 1
        return own_rank - other_rank

    def __len__(self):
        """Return the number of characters in the content."""
        return len(self.content)

    def __unicode__(self):
        """Return the content of the action."""
        return self.content

    @staticmethod
    def _get_action_type(symbol):
        """Return the direct subclass whose SYMBOL equals ``symbol``, or None."""
        for action_type in Action.__subclasses__():
            if symbol == action_type.SYMBOL:
                return action_type
        return None

    def execute(self, post_string):
        """Apply the action to a post-edit string under construction."""
        raise NotImplementedError

    @staticmethod
    def fromstring(string):
        """Build an action from its serialised form (see tostring).

        Raises ValueError if ``string`` does not have the serialised form
        or if no action type is registered for its symbol.
        """
        match = Action.PATTERN.match(string)
        if match is None:
            raise ValueError(
                u'not a serialised action: {0!r}'.format(string))
        position, symbol, escaped = match.groups()
        action_type = Action._get_action_type(symbol)
        if action_type is None:
            raise ValueError(
                u'no action type for symbol {0!r}'.format(symbol))
        return action_type(int(position), unescape(escaped))

    def revert(self, post_string):
        """Undo the action on ``post_string`` and return the result."""
        raise NotImplementedError

    def tostring(self):
        """Return the serialised form, with the content passed through escape()."""
        return Action.FORMAT.format(index=self.index,
                                    symbol=self.SYMBOL,
                                    content=escape(self.content))
class Deletion(Action):
    """An action that removes a run of characters from the pre-edit text."""

    SYMBOL = u'-'

    def __init__(self, index, content=None, length=None):
        """Create a deletion starting at ``index``.

        At least one of ``content`` and ``length`` must be given.  When
        ``content`` is given it is used as is (and ``length`` is ignored);
        otherwise a placeholder of ``length`` spaces is used.
        """
        assert content is not None or length is not None
        super(Deletion, self).__init__(index)
        # the actual substring is not known yet when only a length is
        # given, so a run of wildcard spaces stands in for it
        self.content = ' ' * length if content is None else content

    def execute(self, post):
        """Skip the deleted characters and remember what they were."""
        # the deletion begins wherever the post-string walk currently is
        begin = post._i
        finish = begin + len(self)
        # jump the tracker past the deleted span so that the _PostString
        # constructor does not copy those characters across
        post._i = finish
        # keep the removed text so that the deletion can be reverted later
        self.content = post._pre[begin:finish]

    def revert(self, post):
        """Return ``post`` with the recorded content put back at the index."""
        head, tail = post[:self.index], post[self.index:]
        return head + self.content + tail
class Insertion(Action):
    """An action that adds its content to the post-edit text."""

    SYMBOL = u'+'

    def execute(self, post):
        """Append the inserted text to the _PostString being built."""
        post._string = post._string + unicode(self)

    def revert(self, post):
        """Return ``post`` without the span this insertion occupies."""
        stop = self.index + len(self)
        return post[:self.index] + post[stop:]
# Unit tests for AccountedString and its Insertion / Deletion actions.
# NOTE(review): the fixtures in this class look damaged. Some string
# literals are split across physical lines, and some indices do not fit
# the literals as written (e.g. Insertion(17, ...) against the
# 14-character `simple`). Presumably runs of whitespace or control
# characters were lost from the literals -- TODO restore the original
# fixture text before relying on these tests.
class _TestAccounting(unittest.TestCase):
# shared pre-edit fixture used by test_unmodified and test_insert
simple = u'The quick fox.'
# Helper: start from the post-edit text, revert every Insertion and then
# every Deletion, and check that the result equals the pre-edit text.
def assertReverted(self, accounted_string):
reverted = unicode(accounted_string)
for action_type in (Insertion, Deletion):
for action in accounted_string.actions:
if type(action) == action_type:
reverted = action.revert(reverted)
self.assertEqual(accounted_string._pre, reverted)
# With no actions registered by the test, the post-edit text is expected
# to equal the fixture.
def test_unmodified(self):
test = AccountedString(self.simple, None)
self.assertEqual(self.simple, unicode(test))
# A registered insertion should appear in the post-edit text and be
# revertible.
def test_insert(self):
test = AccountedString(self.simple, None)
test.register(Insertion(17, 'brown '))
self.assertEqual(u'The quick brown fox.', unicode(test))
self.assertReverted(test)
# NOTE(review): both deletions are registered with empty content, so as
# written each has length 0 -- verify against the original fixture.
def test_deletion(self):
test = AccountedString(u'The quick fox.', None)
test.register(Deletion(4, content=''))
test.register(Deletion(12, content=''))
self.assertEqual(u'The quick fox.', unicode(test))
self.assertReverted(test)
# A deletion and an insertion registered at the same index.
# NOTE(review): the three string literals below are broken across lines;
# the character that originally sat at each break is unknown.
def test_mix(self):
test = AccountedString(u'This should do it: sudo reboot
!', None)
test.register(Deletion(19, length=24))
test.register(Insertion(19, '
'))
self.assertEqual(u'This should do it:
!', unicode(test))
self.assertReverted(test)
# A newline between two words should come out as a single space.
def test_normalisation(self):
test = AccountedString(u'blah\nblah', None)
test._normalise_whitespace()
self.assertEqual(u'blah blah', unicode(test))