Python之difflib模块的使用
difflib模块的作用
difflib模块用于比较序列,包含一些计算和处理序列之间差异的工具。它在比较文本时尤其有用,其中的函数可以按多种常用的差异格式生成报告。
1、演示的数据
# Sample data shared by the diff examples: two versions of one paragraph.
#
# Each literal is stored as 11 physical lines because the examples compare
# the texts line by line (the published unified diff reports the hunk
# "-1,11 +1,11"). Collapsing a literal onto one line would make
# splitlines() return a single element and no example output would match.
#
# NOTE(review): the fourth line of text1 carries an extra space after
# "tortor." -- the published output flags a one-character deletion on that
# line. Confirm the exact position against the original data file.
text1 = """Lorem ipsum dolor sit amet, consectetuer adipiscing
elit. Integer eu lacus accumsan arcu fermentum euismod. Donec
pulvinar porttitor tellus. Aliquam venenatis. Donec facilisis
pharetra tortor.  In nec mauris eget magna consequat
convalis. Nam sed sem vitae odio pellentesque interdum. Sed
consequat viverra nisl. Suspendisse arcu metus, blandit quis,
rhoncus ac, pharetra eget, velit. Mauris urna. Morbi nonummy
molestie orci. Praesent nisi elit, fringilla ac, suscipit non,
tristique vel, mauris. Curabitur vel lorem id nisl porta
adipiscing. Suspendisse eu lectus. In nunc. Duis vulputate
tristique enim. Donec quis lectus a justo imperdiet tempus."""

# List of lines without trailing newlines, as consumed by the examples.
text1_lines = text1.splitlines()

text2 = """Lorem ipsum dolor sit amet, consectetuer adipiscing
elit. Integer eu lacus accumsan arcu fermentum euismod. Donec
pulvinar, porttitor tellus. Aliquam venenatis. Donec facilisis
pharetra tortor. In nec mauris eget magna consequat
convalis. Nam cras vitae mi vitae odio pellentesque interdum. Sed
consequat viverra nisl. Suspendisse arcu metus, blandit quis,
rhoncus ac, pharetra eget, velit. Mauris urna. Morbi nonummy
molestie orci. Praesent nisi elit, fringilla ac, suscipit non,
tristique vel, mauris. Curabitur vel lorem id nisl porta
adipiscing. Duis vulputate tristique enim. Donec quis lectus a
justo imperdiet tempus. Suspendisse eu lectus. In nunc."""

text2_lines = text2.splitlines()
2、比较文本体
import difflib

# Import the two line lists by name; a wildcard import would hide where
# text1_lines and text2_lines come from.
from difflib_data import text1_lines, text2_lines

# Compare the two texts line by line and print the resulting delta,
# one delta line per output line.
d = difflib.Differ()
diff = d.compare(text1_lines, text2_lines)
print('\n'.join(diff))
运行效果
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Integer eu lacus accumsan arcu fermentum euismod. Donec - pulvinar porttitor tellus. Aliquam venenatis. Donec facilisis + pulvinar, porttitor tellus. Aliquam venenatis. Donec facilisis ? + - pharetra tortor. In nec mauris eget magna consequat ? - + pharetra tortor. In nec mauris eget magna consequat - convalis. Nam sed sem vitae odio pellentesque interdum. Sed ? - -- + convalis. Nam cras vitae mi vitae odio pellentesque interdum. Sed ? +++ +++++ + consequat viverra nisl. Suspendisse arcu metus, blandit quis, rhoncus ac, pharetra eget, velit. Mauris urna. Morbi nonummy molestie orci. Praesent nisi elit, fringilla ac, suscipit non, tristique vel, mauris. Curabitur vel lorem id nisl porta - adipiscing. Suspendisse eu lectus. In nunc. Duis vulputate - tristique enim. Donec quis lectus a justo imperdiet tempus. + adipiscing. Duis vulputate tristique enim. Donec quis lectus a + justo imperdiet tempus. Suspendisse eu lectus. In nunc.
3、比较文本体,输出统一差异(unified diff)格式
import difflib

# Import the two line lists by name; a wildcard import would hide where
# text1_lines and text2_lines come from.
from difflib_data import text1_lines, text2_lines

# lineterm='' keeps the generated control lines free of trailing newlines,
# so every line can be joined with a single '\n' below.
diff = difflib.unified_diff(
    text1_lines,
    text2_lines,
    lineterm='',
)
print('\n'.join(diff))
运行效果
---
+++
@@ -1,11 +1,11 @@
 Lorem ipsum dolor sit amet, consectetuer adipiscing
 elit. Integer eu lacus accumsan arcu fermentum euismod. Donec
-pulvinar porttitor tellus. Aliquam venenatis. Donec facilisis
-pharetra tortor.  In nec mauris eget magna consequat
-convalis. Nam sed sem vitae odio pellentesque interdum. Sed
+pulvinar, porttitor tellus. Aliquam venenatis. Donec facilisis
+pharetra tortor. In nec mauris eget magna consequat
+convalis. Nam cras vitae mi vitae odio pellentesque interdum. Sed
 consequat viverra nisl. Suspendisse arcu metus, blandit quis,
 rhoncus ac, pharetra eget, velit. Mauris urna. Morbi nonummy
 molestie orci. Praesent nisi elit, fringilla ac, suscipit non,
 tristique vel, mauris. Curabitur vel lorem id nisl porta
-adipiscing. Suspendisse eu lectus. In nunc. Duis vulputate
-tristique enim. Donec quis lectus a justo imperdiet tempus.
+adipiscing. Duis vulputate tristique enim. Donec quis lectus a
+justo imperdiet tempus. Suspendisse eu lectus. In nunc.
4、丢弃无用的数据
from difflib import SequenceMatcher


def show_results(match):
    """Print the fields of *match* and the text it covers in A and B.

    *match* is the three-field result of ``find_longest_match`` (start in
    the first sequence, start in the second sequence, length). The
    module-level strings ``A`` and ``B`` are read to show the matched text.
    """
    print(' a = {}'.format(match.a))
    print(' b = {}'.format(match.b))
    print(' size = {}'.format(match.size))
    start_a, start_b, length = match
    covered_a = A[start_a:start_a + length]
    covered_b = B[start_b:start_b + length]
    print(' A[a:a+size] = {!r}'.format(covered_a))
    print(' B[b:b+size] = {!r}'.format(covered_b))


def _is_space(char):
    """Junk predicate: report whether *char* is a single blank."""
    return char == " "


A = " abcd"
B = "abcd abcd"

print('A = {!r}'.format(A))
print('B = {!r}'.format(B))

# First pass: no junk predicate, so blanks take part in the match.
print('\nWithout junk detection:')
s1 = SequenceMatcher(None, A, B)
match1 = s1.find_longest_match(0, len(A), 0, len(B))
show_results(match1)

# Second pass: blanks are declared junk and cannot start a match.
print('\nTreat spaces as junk:')
s2 = SequenceMatcher(_is_space, A, B)
match2 = s2.find_longest_match(0, len(A), 0, len(B))
show_results(match2)
运行效果
A = ' abcd'
B = 'abcd abcd'

Without junk detection:
 a = 0
 b = 4
 size = 5
 A[a:a+size] = ' abcd'
 B[b:b+size] = ' abcd'

Treat spaces as junk:
 a = 1
 b = 0
 size = 4
 A[a:a+size] = 'abcd'
 B[b:b+size] = 'abcd'
5、比较任意类型
import difflib

s1 = [1, 2, 3, 5, 6, 4]
s2 = [2, 3, 5, 4, 6, 1]

print('Initial data:')
print('s1 =', s1)
print('s2 =', s2)
print('s1 == s2:', s1 == s2)
print()

matcher = difflib.SequenceMatcher(None, s1, s2)
opcodes = matcher.get_opcodes()

# Replay the opcodes from last to first: each one edits s1 in place, and
# working backwards keeps the indexes of the remaining opcodes valid.
for op, a_start, a_end, b_start, b_end in reversed(opcodes):
    incoming = s2[b_start:b_end]

    if op == 'equal':
        print('s1[{}:{}] and s2[{}:{}] are the same'.format(
            a_start, a_end, b_start, b_end))

    elif op == 'replace':
        print(('Replace {} from s1[{}:{}] '
               'with {} from s2[{}:{}]').format(
                   s1[a_start:a_end], a_start, a_end,
                   incoming, b_start, b_end))
        print(' before =', s1)
        s1[a_start:a_end] = incoming

    elif op == 'insert':
        print('Insert {} from s2[{}:{}] into s1 at {}'.format(
            incoming, b_start, b_end, a_start))
        print(' before =', s1)
        s1[a_start:a_end] = incoming

    elif op == 'delete':
        print('Remove {} from positions [{}:{}]'.format(
            s1[a_start:a_end], a_start, a_end))
        print(' before =', s1)
        del s1[a_start:a_end]

    # Printed after every opcode, including 'equal'.
    print(' after =', s1, '\n')

print('s1 == s2:', s1 == s2)
运行效果
Initial data:
s1 = [1, 2, 3, 5, 6, 4]
s2 = [2, 3, 5, 4, 6, 1]
s1 == s2: False

Replace [4] from s1[5:6] with [1] from s2[5:6]
 before = [1, 2, 3, 5, 6, 4]
 after = [1, 2, 3, 5, 6, 1]

s1[4:5] and s2[4:5] are the same
 after = [1, 2, 3, 5, 6, 1]

Insert [4] from s2[3:4] into s1 at 4
 before = [1, 2, 3, 5, 6, 1]
 after = [1, 2, 3, 5, 4, 6, 1]

s1[1:4] and s2[0:3] are the same
 after = [1, 2, 3, 5, 4, 6, 1]

Remove [1] from positions [0:1]
 before = [1, 2, 3, 5, 4, 6, 1]
 after = [2, 3, 5, 4, 6, 1]

s1 == s2: True
6、比较输出html格式
import difflib

# Import the two line lists by name; a wildcard import would hide where
# text1_lines and text2_lines come from.
from difflib_data import text1_lines, text2_lines

# Build the comparison of the two texts as an HTML table and print it.
d = difflib.HtmlDiff()
print(d.make_table(text1_lines, text2_lines))
7、比较输出ndiff格式(与Differ的输出风格相同)
import difflib

# Import the two line lists by name; a wildcard import would hide where
# text1_lines and text2_lines come from.
from difflib_data import text1_lines, text2_lines

# ndiff() is the function-style way to compare two lists of lines.
diff = difflib.ndiff(text1_lines, text2_lines)
print('\n'.join(diff))
8、比较输出上下文差异(context diff)格式
import difflib

# Import the two line lists by name; a wildcard import would hide where
# text1_lines and text2_lines come from.
from difflib_data import text1_lines, text2_lines

# lineterm='' keeps the generated control lines free of trailing newlines,
# so every line can be joined with a single '\n' below.
diff = difflib.context_diff(
    text1_lines,
    text2_lines,
    lineterm='',
)
print('\n'.join(diff))