Tags

, , ,

For Pi Day I published a post on my main blog, Logos Con Carne. Here is the Python code behind the data generated in that post.

The first program generates a frequency count of the digits in pi as well as frequency counts of pairs of digits:

'''\
data[digit-n]:
[   count
,   percentage
,   frequency-data[digit-n]:
        [   count
        ,   percentage
        ]
]

November 2015
'''
from sys import stdin, stdout, stderr, argv
from os import path
from datetime import datetime

# Directory that holds both the input digit file and the generated report.
BasePath = r'C:\my\path\to\this\stuff'
# Digit file read by pass_one (joined onto BasePath).
InputFile  = 'pi.lst'
# Report file written by print_data (joined onto BasePath).
OutputFile = 'pi-1.out'

def pass_one(data, filename=None, max_lines=None):
    '''Read the digit file; tally digit counts and digit-pair counts.

    data      -- list of ten per-digit records, indexed by digit value.
                 Each record is [count, percentage, pairs]; pairs is a
                 list of ten [count, percentage] entries indexed by the
                 digit that came immediately before.  Only the two
                 count fields are updated here (in place).
    filename  -- file to read; defaults to InputFile under BasePath.
    max_lines -- reading stops as soon as the number of lines read
                 exceeds this value; defaults to 999999.

    Characters other than '0'..'9' (a decimal point, spacing, ...) are
    skipped and not counted, so they can never be mistaken for a digit
    through a negative list index.

    Returns a dict with the start/end timestamps ('t0', 't1'), the
    number of lines read ('lines') and the number of digits counted
    ('chars').
    '''
    t0 = datetime.now()
    if filename is None:
        filename = path.join(BasePath, InputFile)
    if max_lines is None:
        max_lines = 999999
    lines = 0
    chars = 0
    last_char = None
    # The with-block closes the file even if counting raises.
    with open(filename, 'r') as fp:
        # Read pi digits (line by line)...
        for txt in fp:
            lines += 1
            for ch in txt.strip():
                if not '0' <= ch <= '9':
                    continue
                chars += 1
                n = ord(ch) - ord('0')
                # Get the dataset for this digit...
                d = data[n]
                # Bump the digit frequency count...
                d[0] += 1
                # Bump the count of "n preceded by last_char"; pairs
                # carry over from one line to the next.
                if last_char is not None:
                    d[2][last_char][0] += 1
                last_char = n
            if max_lines < lines:
                break
    # Return result...
    t1 = datetime.now()
    return {'t0': t0, 't1': t1, 'lines': lines, 'chars': chars}

def pass_two(data, r1):
    '''Convert the raw counts in data to percentages of all digits read.

    data -- the per-digit records filled in by pass_one; each record is
            [count, percentage, pairs] and each pairs entry is
            [count, percentage].  The percentage fields are overwritten
            in place.
    r1   -- the result dict from pass_one; only r1['chars'] (the total
            used as the denominator) is read.

    Both the digit percentages and the digit-pair percentages are
    relative to that same total.  When the total is zero the
    percentage fields are left untouched instead of dividing by zero.

    Returns a dict with the start/end timestamps ('t0', 't1').
    '''
    t0 = datetime.now()
    total = float(r1['chars'])
    # Calculate percentages...
    if total:
        for d in data:
            d[1] = float(100) * (float(d[0]) / total)
            for p in d[2]:
                p[1] = float(100) * (float(p[0]) / total)
    # Return result...
    t1 = datetime.now()
    return {'t0': t0, 't1': t1}

def print_data(data, r1, r2, filename=None):
    '''Write the digit and digit-pair frequency report.

    data     -- the per-digit records ([count, percentage, pairs]) as
                filled in by pass_one and pass_two.
    r1       -- result dict from pass_one ('t0', 't1', 'lines', 'chars').
    r2       -- result dict from pass_two ('t0', 't1').
    filename -- file to write; defaults to OutputFile under BasePath.

    The report has a header (start time, line/digit totals, elapsed
    seconds of both passes), a table of digit counts with each
    percentage's difference from 10, and then, for every digit, its ten
    pair counts with each percentage's difference from 1.

    Returns a dict with the start/end timestamps ('t0', 't1').
    '''
    t0 = datetime.now()
    e1 = (r1['t1'] - r1['t0']).total_seconds()
    e2 = (r2['t1'] - r2['t0']).total_seconds()
    if filename is None:
        filename = path.join(BasePath, OutputFile)
    # The with-block closes the file even if formatting raises.
    with open(filename, 'w') as fp:
        def emit(text=''):
            # One report line; write() behaves the same on Python 2 and 3.
            fp.write(text + '\n')

        emit('%(t0)s' % r1)
        emit('%(lines)d lines, %(chars)d chars' % r1)
        emit('Pass-1: %7.3f seconds' % e1)
        emit('Pass-2: %7.3f seconds' % e2)
        emit()
        # Print digit frequency table...
        for dx, d in enumerate(data):
            s = '%d: %8d (%9.6f, %+.6f)'
            t = (dx, d[0], d[1], float(10) - d[1])
            emit(s % t)
        emit()
        emit()
        emit()
        # Print previous digit frequency tables...
        for dx, d in enumerate(data):
            s = '%d: %8d (%9.6f)'
            t = (dx, d[0], d[1])
            emit(s % t)
            for px, p in enumerate(d[2]):
                s = '%d_%d: %8d (%.6f, %+.6f)'
                t = (dx, px, p[0], p[1], float(1) - p[1])
                emit(s % t)
            emit()
        emit()
    t1 = datetime.now()
    return {'t0': t0, 't1': t1}

# One record per digit 0-9: [count, percentage, pairs], where pairs holds
# ten [count, percentage] entries indexed by the preceding digit.
Data = [[0,0,[[0,0] for px in range(10)]] for dx in range(10)]
# Do the thing...
# (pass_one fills in the counts; pass_two needs its result for the total.)
result1 = pass_one(Data)
result2 = pass_two(Data, result1)
# Emit the result...
result3 = print_data(Data, result1, result2)

'''eof'''

The second program looks for repeating sequences of digits:

'''\
data[digit-n]:
[   sequence-data[length-n]:
    {   count
    }
]

November 2015
'''
from sys import stdin, stdout, stderr, argv
from os import path
from datetime import datetime

# Directory that holds both the input digit file and the generated report.
BasePath = r'C:\my\path\to\this\stuff'
# Digit file read by pass_one (joined onto BasePath).
InputFile  = 'pi.lst'
# Report file written by print_data (joined onto BasePath).
OutputFile = 'pi-2.out'

# pass_one stops reading once the number of lines read exceeds this value.
MaxLines = 9999999

def _record_run(data, digit, length):
    '''Count one finished run of `length` equal digits; ignore length < 2.'''
    if 1 < length:
        runs = data[digit]
        runs[length] = runs.get(length, 0) + 1


def pass_one(data, filename=None, max_lines=None):
    '''Read the digit file; count runs of repeated digits.

    data      -- list of ten dicts, indexed by digit value.  Each dict
                 maps a run length (2 or more) to the number of runs of
                 exactly that length; it is updated in place.
    filename  -- file to read; defaults to InputFile under BasePath.
    max_lines -- reading stops as soon as the number of lines read
                 exceeds this value; defaults to MaxLines.

    Runs carry over from one line to the next.  Characters other than
    '0'..'9' are skipped: they are not counted and do not end a run,
    and they can never reach data through a negative list index.

    Returns a dict with the start/end timestamps ('t0', 't1'), the
    number of lines read ('lines') and the number of digits counted
    ('chars').
    '''
    t0 = datetime.now()
    if filename is None:
        filename = path.join(BasePath, InputFile)
    if max_lines is None:
        max_lines = MaxLines
    lines = 0
    chars = 0
    last_char = None
    seq_length = 0
    # The with-block closes the file even if counting raises.
    with open(filename, 'r') as fp:
        # Read pi digits (line by line)...
        for txt in fp:
            lines += 1
            for ch in txt.strip():
                if not '0' <= ch <= '9':
                    continue
                chars += 1
                n = ord(ch) - ord('0')
                if n == last_char:
                    seq_length += 1
                    continue
                # A different digit ends the current run; start a new one.
                _record_run(data, last_char, seq_length)
                last_char = n
                seq_length = 1
            if max_lines < lines:
                break
        # The run still open when reading stops has to be counted too.
        _record_run(data, last_char, seq_length)
    # Return result...
    t1 = datetime.now()
    return {'t0': t0, 't1': t1, 'lines': lines, 'chars': chars}

def print_data(data, r1, filename=None):
    '''Write the repeated-digit run report.

    data     -- list of ten dicts (one per digit) mapping run length to
                the number of runs of that length, as filled in by
                pass_one.
    r1       -- result dict from pass_one ('t0', 't1', 'lines', 'chars').
    filename -- file to write; defaults to OutputFile under BasePath.

    The report has a header (start time, line/digit totals, elapsed
    seconds of pass one) followed, for every digit, by one line per
    recorded run length in ascending order.

    Returns a dict with the start/end timestamps ('t0', 't1').
    '''
    t0 = datetime.now()
    e1 = (r1['t1'] - r1['t0']).total_seconds()
    if filename is None:
        filename = path.join(BasePath, OutputFile)
    # The with-block closes the file even if formatting raises.
    with open(filename, 'w') as fp:
        def emit(text=''):
            # One report line; write() behaves the same on Python 2 and 3.
            fp.write(text + '\n')

        emit('%(t0)s' % r1)
        emit('%(lines)d lines, %(chars)d chars' % r1)
        emit('Pass-1: %7.3f seconds' % e1)
        emit()
        # Print...
        for dx, d in enumerate(data):
            emit('Digit: %d' % dx)
            for k in sorted(d):
                emit('|-%s: %s' % (k, d[k]))
        emit()
        emit()
        emit()
    t1 = datetime.now()
    return {'t0': t0, 't1': t1}

# One dict per digit 0-9, mapping run length -> number of runs of that
# length (pass_one only records runs of two or more).
Data = [{} for dx in range(10)]
# Do the thing...
result1 = pass_one(Data)
# Emit the result...
result3 = print_data(Data, result1)

'''eof'''

Both programs assume a text file (pi.lst) containing the digits of pi. The digits are arranged in lines, allowing the file to be read line-by-line. No assumptions are made about the length of the lines or the count of lines.

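The MaxLines global variable in the second program allows throttling the amount read for testing and development; the first program has no such variable and instead hard-codes a limit of 999999 inside pass_one.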