Words Visz
Seeing is believing. Belief generates faith. Faith is power to move and create.
Here is some code snippet that might be helpful to visualize the text words like a tree. I created so that I can see the regex pattern codes used in a sold product.
``
import pandas as pd
from treelib import Node, Tree
from collections import defaultdict
class display_data():
def __init__(self, data=[]):
self.data = pd.Series(data)
self.sam = defaultdict(int)
# create a temporaray fns
def get_first_charecter(item_number):
item_number = str(item_number)
if item_number:
return item_number[0]
return '@'
# PERFORMANCE: SUGGESTION - pass not any self.fns to `map` fns.
self.all_char_families = sorted(set(map(get_first_charecter, self.data)))
# update N-gram Dict
self.update_sam()
def reset_sam(self):
self.sam = defaultdict(int)
def update_sam(self):
'''Parse each code in N-Grams fashion and update `sam` dict.'''
if self.sam:
return
sam = self.sam
data = self.data
def update_code_to_sam(code):
tmp = ''
for char in str(code):
tmp += char
sam[tmp] += 1
for code in data:
update_code_to_sam(code)
def get_char_family_data(self, char):
'''Builds tree data for a Specific Charecter.'''
sam = self.sam
tmp = sorted(list(sam.items()), key=lambda x: x[0])
char_family_data = list(filter(lambda x: x[0].startswith(char), tmp))
return char_family_data
def charecter_show_tree(self, data):
'''Builds tree and displays data.'''
tree = Tree()
for node_name, count in data:
title = node_name + '(' + str(count) + ')'
if len(node_name) == 1:
# Narent Node
tree.create_node(title, node_name)
else:
if count > 10:
tree.create_node(title, node_name, parent=node_name[:-1])
tree.show()
def show_char_family(self, char):
'''Main Tree plot to collect data and show as tree.'''
update_sam = self.update_sam
if not sam:
update_sam()
char_family_data = self.get_char_family_data(char)
print(char, '- Family Datasize is', len(char_family_data))
if char_family_data:
# if data exists
self.charecter_show_tree(char_family_data)
return
TEST DATA VIS
# making 10 copies as 10 is min to show in tree
tmp = display_data(['a', 'aaa', 'a', 'aaa' , 'a',
'aaaa123', 'a123', 'a1233', 'adf13',
'aabbb123', 'ab123', 'abb1233', 'adbbbf13',
'abc', 'abcde', 'abs', 'absol', 'absolute',
'ab', 'abo', 'abov', 'above'
'ab', 'abe', 'abel',
'abraham', 'abe', 'abra'
'c', 'c1', 'c2', 'c3',
'd', 'd1', 'd2', 'd4',
] * 10 )
print(tmp.all_char_families, tmp.show_char_family('a'))
OUTPUT
a - Family Datasize is 58
a(270)
├── a1(20)
│ └── a12(20)
│ └── a123(20)
├── aa(40)
│ └── aaa(30)
├── ab(160)
│ ├── abc(20)
│ ├── abe(30)
│ ├── abo(30)
│ │ └── abov(20)
│ ├── abr(20)
│ │ └── abra(20)
│ └── abs(30)
│ └── abso(20)
│ └── absol(20)
└── ad(20)
['a', 'c', 'd'] None