%pylab inline
from IPython.display import clear_output
Populating the interactive namespace from numpy and matplotlib
The materials of this section are borrowed from Prof. Sharad Ramanathan's lecture notes for PHY 286, MacKay's book, Dr. Matthew N. Bernstein's GitHub blog on information theory, and section materials from previous years.
Maximum ignorance/entropy with constraints¶
Ignorance was related to the probabilities of the different outcomes. If we have a variable $x$, what probabilities should we assign to each outcome if we are completely ignorant?
If $\sum_{i} p_{i}=1$ is the only constraint (we are completely ignorant), we get a uniform distribution: we expect the probability of each outcome to be equal.
When going from no information at all (except that the probability adds to one) to knowing some information (e.g. mean, variance...), our ignorance decreases, and correspondingly the probabilities we expect for the different outcomes change.
Now let's consider a physical system with different energy levels: $E_1 < E_2 < \dots < E_N$. Consider a single particle in this system that sits in a thermal bath and has an average energy $E$ (this is another constraint, or in other words, another piece of information we know about the system). What is the probability that it occupies energy level $i$?
$\frac{d}{d p_j}\left[-\sum_{i=1}^N p_i \log p_i-\lambda\left(\sum_i p_i-1\right)-\beta\left(\sum_i p_i E_i-E\right)\right]=0 \\ \Longrightarrow p_j=\exp (-1-\lambda) \exp \left(-\beta E_j\right)$¶
We call $Z=\sum_{i=1}^N \exp \left(-\beta E_i\right)$ the partition function.
$p_i=\frac{1}{Z} \exp \left(-\beta E_i\right)$. From statistical mechanics $\beta =\frac{1}{k_B T}$.
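As a quick numerical sanity check (a minimal sketch: the energy levels and value of $\beta$ below are made-up, not from the notes), we can compute the Boltzmann probabilities, the partition function, and the resulting mean energy and entropy:
E_levels = np.array([1.0, 2.0, 3.0, 4.0])   # hypothetical energy levels (assumed values)
beta = 1.0                                  # 1/(k_B T), assumed value
weights = np.exp(-beta * E_levels)
Z = weights.sum()                           # partition function
p = weights / Z                             # Boltzmann probabilities p_i = exp(-beta E_i)/Z
print(p)
print(np.sum(p * E_levels))                 # the constrained average energy E
print(-np.sum(p * np.log(p)))               # the entropy/ignorance of this distribution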
Graphical view of Lagrange multiplier¶
Maximization of function $f(x,y)$ subject to the constraint $g(x,y)=0$. At the constrained local optimum, the gradients of $f$ and $g$, namely $\nabla f(x,y)$ and $\nabla g(x,y)$, are parallel.
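As a small illustrative sketch (using a made-up example, $f(x,y) = x + y$ maximized on the unit circle $g(x,y) = x^2 + y^2 - 1 = 0$), we can plot the contours of $f$, the constraint curve, and the two gradients at the constrained optimum to see that they are parallel:
# Sketch (not from the notes): contours of f(x,y) = x + y and the constraint circle g(x,y) = 0.
# At the constrained maximum (1/sqrt(2), 1/sqrt(2)), the gradients of f and g point the same way.
x = np.linspace(-1.5, 1.5, 200)
y = np.linspace(-1.5, 1.5, 200)
X, Y = np.meshgrid(x, y)
fig, ax = subplots(figsize=(5, 5))
ax.contour(X, Y, X + Y, levels=10)                                # contours of f
theta = np.linspace(0, 2 * np.pi, 200)
ax.plot(np.cos(theta), np.sin(theta), 'k', label='g(x,y) = 0')    # constraint circle
opt = np.array([1.0, 1.0]) / np.sqrt(2)                           # constrained maximum
ax.quiver(opt[0], opt[1], 1.0, 1.0, angles='xy', scale_units='xy', scale=3, color='r', label=r'$\nabla f$')
ax.quiver(opt[0], opt[1], 2 * opt[0], 2 * opt[1], angles='xy', scale_units='xy', scale=3, color='b', label=r'$\nabla g$')
ax.set_aspect('equal')
ax.legend()
plt.show()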

Shannon’s Source Coding Theorem¶
Digital communication is the transmission of symbolic-valued signals from one place to another. When faced with the problem, for example, of sending a file across the Internet, we must first represent each character by a bit sequence. Because we want to send the file quickly, we want to use as few bits as possible. However, we don't want to use so few bits that the receiver cannot determine what each character was from the bit sequence. For example, if we used a single bit for every character, file transmission would be fast but useless, because many characters would share the same code word and could not be told apart at the receiving end. Shannon taught us how much we can compress the data without losing information, a result we today call the Source Coding Theorem.
Suppose we want to transmit a letter with four possible outcomes "A", "B", "C", "D". I'll first encode this letter, transmit the information to you, and you'll decode it. Each of these outcomes has probability:
$p(A)=\frac{1}{2}$, $p(B)=\frac{1}{4}$, $p(C)=\frac{1}{8}$, $p(D)=\frac{1}{8}$.
I want to develop a code for these four outcomes using binary bits (in Morse code we use "dash" and "dot"). One possible way to encode them is:
A = 00, B = 01, C = 10, D = 11
Here I assign the same code length of 2 bits to each outcome, so every transmitted letter costs exactly 2 bits.
But notice the entropy/average information of this letter is:
$I=-\sum_i p(i) \log (p(i))=1 / 2 \log (2)+1 / 4 \log (4)+2 / 8 \log (8)=7 / 4$ bits.
We can see that the entropy, or average information of the outcome, is less than two bits. This means that our code is inefficient and is wasting space. It can be compressed.
Another possible way to encode the same four outcomes is:
A = 0, B = 10, C = 110, D = 111
Here the code length (number of bits) assigned to each outcome exactly equals $-\log_2 p_{i}$. The average code length is:
$\bar{l}=p(A) l_A + p(B) l_B + p(C) l_C + p(D) l_D = 7 / 4$ bits.
This code has an expected code length equal to the entropy of this letter that we want to transmit. In fact, this code is maximally compressed.
Given some categorical distribution of a random variable X, Shannon's Source Coding Theorem tells us that no matter which (uniquely decodable) code C you choose, the smallest possible expected code word length is the entropy of X.
$\mathrm{E}[|C(X)|]=\sum_{x}|C(x)|\, P(X=x) \geq-\sum_{x} P(X=x) \log P(X=x)=H(X)$¶
If we know the probability distribution, we can compress the data maximally.
Morse code uses the same idea: it assigns a single dot to "e" because "e" is the most frequent letter of the alphabet.
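As a quick check (a minimal sketch using the A/B/C/D example above), we can compare the expected code length of the two codes against the entropy:
probs = {'A': 1/2, 'B': 1/4, 'C': 1/8, 'D': 1/8}
fixed_code = {'A': '00', 'B': '01', 'C': '10', 'D': '11'}
prefix_code = {'A': '0', 'B': '10', 'C': '110', 'D': '111'}
entropy = -sum(p * np.log2(p) for p in probs.values())
avg_len_fixed = sum(probs[s] * len(fixed_code[s]) for s in probs)
avg_len_prefix = sum(probs[s] * len(prefix_code[s]) for s in probs)
print(entropy, avg_len_fixed, avg_len_prefix)   # 1.75 bits, 2.0 bits, 1.75 bits: the prefix code reaches the entropy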

Mutual information¶
$ MI(X,Y) = I(X)+I(Y)-I(X,Y)$¶
import pandas as pd
from IPython.display import display, HTML

data1 = {'p(x,y)': ['y=0', 'y=1'],
         'x=0': [0.5, 0],
         'x=1': [0, 0.5]}
df_1 = pd.DataFrame(data1)
df_html_1 = df_1.to_html(index=False)
display(HTML(df_html_1))
| p(x,y) | x=0 | x=1 |
|---|---|---|
| y=0 | 0.5 | 0.0 |
| y=1 | 0.0 | 0.5 |
$I(X, Y)=-1 / 2 \log (1 / 2)-1 / 2 \log (1 / 2)=1$
$I(X)=I(Y)=-1 / 2 \log (1 / 2)-1 / 2 \log (1 / 2)=1$
$MI(X, Y ) = 1 + 1 − 1 = 1$
The information that knowledge of variable X gives about variable Y is 1 bit. This is equal to the amount of information stored in variable Y alone. Therefore, X tells us everything about Y and vice versa.
data2 = {'p(x,y)': ['y=0', 'y=1'],
         'x=0': [0, 0.5],
         'x=1': [0, 0.5]}
df_2 = pd.DataFrame(data2)
df_html_2 = df_2.to_html(index=False)
display(HTML(df_html_2))
| p(x,y) | x=0 | x=1 |
|---|---|---|
| y=0 | 0.0 | 0.0 |
| y=1 | 0.5 | 0.5 |
$I(X, Y)=-1 / 2 \log (1 / 2)-1 / 2 \log (1 / 2)=1$
$I(X)=-1 / 2 \log (1 / 2)-1 / 2 \log (1 / 2)=1$
$I(Y)=0$
$MI(X, Y ) = 1 + 0 − 1 = 0$
If we know the value of Y, we have still gained zero information about the value of X. Similarly, if we know the value of X, we have still gained zero information about the value of Y (since there was already only one possibility).
$MI(X, Y) \\ =-\sum_{x} p(x) \log (p(x))-\sum_{y} p(y) \log (p(y))+\sum_{x, y} p(x, y) \log (p(x, y)) \\ =-\sum_{x, y} p(x, y) \log (p(x))-\sum_{x, y} p(x, y) \log (p(y))+\sum_{x, y} p(x, y) \log (p(x, y))\\ =\sum_{x, y} p(x, y) \log \left(\frac{p(x, y)}{p(x) p(y)}\right)$
The mutual information is the Kullback-Leibler divergence between the joint distribution $p(x,y)$ and the product of the marginals $p(x)p(y)$.
$MI(X, Y) \\ =I(X)-[-\sum_{x,y} p(x,y) \log (p(x|y))] \\ =I(X)-[-\sum_{y} p(y) \sum_{x} p(x|y) \log (p(x|y))]\\ =I(X)-I(X|Y)\\ =I(Y)-I(Y|X)$
$I(X|Y)$ is the ignorance/entropy of X given Y (the conditional entropy). Mutual information is the reduction in our ignorance of X once we know Y.
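As a sketch (written here for checking, not part of the original derivation), we can compute the mutual information directly from a joint probability table and verify the two examples above:
def mutual_information(joint):
    '''Mutual information (in bits) of a 2D joint probability table; rows index x, columns index y.'''
    joint = np.asarray(joint, dtype=float)
    px = joint.sum(axis=1)                    # marginal p(x)
    py = joint.sum(axis=0)                    # marginal p(y)
    mi = 0.0
    for i in range(joint.shape[0]):
        for j in range(joint.shape[1]):
            if joint[i, j] > 0:               # 0 log 0 = 0 by convention
                mi += joint[i, j] * np.log2(joint[i, j] / (px[i] * py[j]))
    return mi

print(mutual_information([[0.5, 0.0], [0.0, 0.5]]))   # first example: 1 bit
print(mutual_information([[0.0, 0.5], [0.0, 0.5]]))   # second example: 0 bits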
Information content of DNA sequences¶
Often in the field of bioinformatics a question comes to mind: what is the shortest motif that we expect to appear only once? Or, what is the shortest sequence length that can be uniquely found in the genome?
This is important when you are trying to figure out whether a DNA sequence of interest appears in a genome simply by chance or whether its appearance is significant.
Eukaryotic genomes have on the order of $L = 10^6$–$10^{10}$ nucleotides. We can define $l$, an arbitrary length of a motif of interest. We will also begin by assuming that the composition of the different bases (A, C, G, T) is balanced, so that each appears in equal amounts throughout the genome.
Let's write a function that generates genomes with predetermined characteristics, so we can test some of these concepts empirically:
def generate_genome(size, g_content, c_content, a_content):
    '''
    Generates a random genome of user-defined size and base composition.
    Parameters:
    - size: number of bases.
    - g_content, c_content, a_content: % of G, C and A content (T makes up the remainder to 100%).
    '''
    g_comp = ['G'] * g_content
    c_comp = ['C'] * c_content
    a_comp = ['A'] * a_content
    t_comp = ['T'] * (100 - (g_content + c_content + a_content))
    human_genome_composition = np.array(g_comp + a_comp + t_comp + c_comp)  # array with the target composition
    human_genome = np.random.choice(human_genome_composition, int(size))    # sample from the composition to generate the full genome
    return ''.join(human_genome)                                            # return the full genome as a string
%%time
genome = generate_genome(size = 10e6, g_content = 25, c_content = 25, a_content=25)
genome[:20]
bases , counts = np.unique(list(genome), return_counts=True)
comps = counts/10e6*100
print("Composition of our mock genome:\n"
+ bases[0]+": " + str(round(comps[0],2)) + " %\n"
+ bases[1]+": " + str(round(comps[1],2)) + " %\n"
+ bases[2]+": " + str(round(comps[2],2)) + " %\n"
+ bases[3]+": " + str(round(comps[3],2)) + " %" )
Now we can play with our genome. For instance, if we pick our motif to be "A", it is expected to match about 1/4 of the positions in the genome, i.e. ~2.5M times:
genome.count('A')
If we take a motif of length l=2, for instance "AA", it will match much more rarely: about 1/16 of the positions if we count overlapping matches, and ~0.5M times with str.count, which counts non-overlapping occurrences (as the toy example below shows):
s = 'AAAA'
s.count('AA')
genome.count('AA')
We kind of see a pattern here: the longer the sequence, the less likely it is to be found in the genome. Let's try to build an intuition for the information content of a sequence by following a series of thought experiments that we can then validate through code.
First, let's imagine we have a genome with 50% G, 50% C, and 0% A or T composition. When we sample from it sequentially, we know that we are only going to get Gs and Cs.
How surprised would we be if the next sample we took was either a G or a C? Not much, right? Because these are the only two possibilities in this example. If we expect a G or a C with 50% chance each, we would be equally surprised to get one or the other.
However, it is hard to say how surprised we would be (infinitely?) if the next sample was an A or a T. So the surprise really isn't well defined in that case, but that's OK, because we are talking about events that can't happen.
genome = generate_genome(size = 1000, g_content = 50, c_content = 50, a_content = 0)  # make genome size manageable for dynamic plotting

As = 0
Ts = 0
Gs = 0
Cs = 0
while (As + Ts + Gs + Cs) <= 200:
    clear_output(wait=True)
    fig, ax = subplots(ncols=1, nrows=1, sharey=False)
    ax.bar(['A', 'C', 'T', 'G'], [As, Cs, Ts, Gs])
    ax.set_xlabel('Nucleotide')
    ax.set_ylabel('Count')
    sample = np.random.choice(list(genome))  # turn the genome string into a list to sample a single base
    if sample == 'C':
        Cs += 1
    if sample == 'T':
        Ts += 1
    if sample == 'G':
        Gs += 1
    if sample == 'A':
        As += 1
    plt.show()
Let's now see what happens if we make the genome 90% GC (45% G and 45% C) and 10% AT (5% A and 5% T).
genome = generate_genome(size = 1000, g_content = 45, c_content = 45, a_content = 5)  # make genome size manageable for dynamic plotting

As = 0
Ts = 0
Gs = 0
Cs = 0
while (As + Ts + Gs + Cs) <= 200:
    clear_output(wait=True)
    fig, ax = subplots(ncols=1, nrows=1, sharey=False)
    ax.bar(['A', 'C', 'T', 'G'], [As, Cs, Ts, Gs])
    ax.set_xlabel('Nucleotide')
    ax.set_ylabel('Count')
    sample = np.random.choice(list(genome))  # turn the genome string into a list to sample a single base
    if sample == 'C':
        Cs += 1
    if sample == 'T':
        Ts += 1
    if sample == 'G':
        Gs += 1
    if sample == 'A':
        As += 1
    plt.show()
This example basically shows us that surprise is inversely related to probability: the more likely an event is to occur, the less surprised we'll be when it actually occurs, and vice versa.
In other words, and specifically for our example: if the probability of sampling an A or a T is low, the surprise of sampling either is high. And if the probability of sampling a C or a G is high, the surprise of sampling either is low.
A good way to describe this mathematically is by taking the logarithm of the inverse of the probability:
$$\mbox{Surprise}= S = \log{\frac{1}{p(x)}}$$
In that case we could calculate the amount of surprise we'd expect for our genomes (using the natural logarithm here); for this last case it would be:
\begin{aligned} S_{A} &= \log{\frac{1}{p(A)}} = \log{\frac{1}{0.05}} \approx 3 \\ S_{T} &= \log{\frac{1}{p(T)}} = \log{\frac{1}{0.05}} \approx 3 \\ S_{C} &= \log{\frac{1}{p(C)}} = \log{\frac{1}{0.45}} \approx 0.8 \\ S_{G} &= \log{\frac{1}{p(G)}} = \log{\frac{1}{0.45}} \approx 0.8 \end{aligned}
As expected the surprise of sampling a G or a C is smaller than it is for an A or a T.
But what about longer motifs? Let's say we want to know the probability of finding the sequence GGA. The probability of getting a G is $p(G) = 0.45$ and the probability of getting an A is $p(A) = 0.05$, so the probability of getting the motif is:
$$p(GGA) = p(G) \times p(G) \times p(A) = 0.45 \times 0.45 \times 0.05 \approx 0.01 $$
If we want to know exactly how surprising it would be to observe this particular motif, we can plug this probability into our previous expression for surprise:
$$S_{GGA} = \log{\frac{1}{p(GGA)}} = \log{\frac{1}{0.01}} \approx 4.59$$
But, more importantly, notice that the total surprise for the motif is the sum of the individual surprises (because of the properties of logarithms):
$$S_{GGA} = \log{\frac{1}{p(G)}} + \log{\frac{1}{p(G)}} + \log{\frac{1}{p(A)}} = -\big(\log{p(G)} + \log{p(G)} + \log{p(A)}\big) \approx 4.59$$
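As a quick check (a minimal sketch, using natural logarithms as in the numbers above), we can compute the per-base surprises for the 90% GC genome and verify that they add up to the surprise of the motif GGA:
p = {'A': 0.05, 'T': 0.05, 'C': 0.45, 'G': 0.45}               # base probabilities of the 90% GC genome
surprise = {base: np.log(1 / prob) for base, prob in p.items()}
print(surprise)                                                # ~3 for A/T, ~0.8 for C/G

p_GGA = p['G'] * p['G'] * p['A']
print(np.log(1 / p_GGA))                                       # ~4.59
print(surprise['G'] + surprise['G'] + surprise['A'])           # same value: surprise is additive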
So far we have been talking about surprise, and this should feel familiar after this week's class. That is because entropy (E) is the average surprise per event.
Tying this back to our example, how can we calculate the average entropy of a motif of length $l$?
Let's first look at the entropy of a single position, which holds one of the four possible bases A, G, C, T.
$$E_\mbox{position} = \sum_{base \in \{A, G, C, T \} } p(base) \times surprise_{base}$$
Plugging in what we set as the surprise, and simplifying:
$$E_\mbox{position} = -\sum_{base \in \{A, G, C, T \} } p(base) \times \log p(base)$$ Assuming positions are independent of each other, we can use the additivity property of entropy to calculate the entropy of a motif of length $l$:
$$E_\mbox{motif} = l \times E_\mbox{position} = -l \big(\sum_{base \in \{A, G, C, T \} } p(base) \times \log p(base) \big)$$ We've arrived at the formula described by Shannon for entropy. What this tells us is the average information content per base position. It turns out there's a very concrete interpretation for this: when the logarithm is taken in base 2, it measures the number of bits in a given message.
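A minimal numerical sketch (using log base 2, so the values are in bits) of the per-position and per-motif entropies for a uniform genome and the 90% GC genome above:
def entropy_per_position(p_bases):
    '''Entropy (in bits) of one position, given a dict of base probabilities.'''
    p = np.array([v for v in p_bases.values() if v > 0])   # skip impossible bases (0 log 0 = 0)
    return -np.sum(p * np.log2(p))

p_uniform = {'A': 0.25, 'T': 0.25, 'C': 0.25, 'G': 0.25}
p_gc_rich = {'A': 0.05, 'T': 0.05, 'C': 0.45, 'G': 0.45}

l = 12  # an arbitrary motif length
print(entropy_per_position(p_uniform), l * entropy_per_position(p_uniform))   # 2 bits per position, 2l bits per motif
print(entropy_per_position(p_gc_rich), l * entropy_per_position(p_gc_rich))   # lower: a biased composition carries less information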
- But how can we address the original question we had in mind? (see top of the notebook if you've forgotten already.)¶
Let's try to tackle this question using the concept of entropy. For simplicity, from now on we will assume that all bases are equally likely at every position along the genome.
The entropy of the location of a motif depends on how many positions in the genome the motif could occupy. For a large (or circular) genome of length $L$, the motif length $l$ is negligible compared to $L$, so the number of positions where the motif could sit is essentially the number of bases in the genome. Let $X$ be a discrete random variable representing the location in a genome of length $L$ (i.e. position 1, 2, ..., $L$).
An expression for the entropy of the location in the genome can be written as follows:
$$E_\mbox{location} = -\sum_{i=1}^{L} p(x_i) \log(p(x_i)) $$
Let's assume all locations of the genome are equally likely for the motif, so the probability of finding the motif at any $x_i$ is:
$$p(x_i) = \frac{1}{L} $$
Plugging this back into the equation we get:
$$E_\mbox{location} = - \log(1/L) $$
$$E_\mbox{location} = \log(L) $$
We can also calculate the information content of our motif:
$$E_\mbox{motif} = -l \big(\sum_{base \in \{A, G, C, T \} } p(base) \times \log p(base) \big)$$
Let's assume that A, G, C, T are all equally likely, so we have a $p(A) = p(G) = p(C) = p(T) = 1/4$:
$$\begin{aligned} E_\mbox{motif} &= -l \big(\sum_{base \in \{A, G, C, T \} } \frac{1}{4} \times \log \frac{1}{4} \big) \\ &= -l \log \frac{1}{4} \\ &= l \log 4 \\ & = 2l \\ \end{aligned}$$
Great, now what we want is to find the $l$ such that the entropy of our motif is at least as large as the entropy of the location (measuring both in bits, i.e. using $\log_2$):
$$\begin{aligned} E_\mbox{location} &\leq E_\mbox{motif} \\ \log_2 L &\leq 2l \\ \end{aligned}$$
Thus: $$ l \geq \frac{1}{2}\log_2(L) $$
The motif has to be at least $\frac{1}{2}\log_2(L)$ bases long for us to expect it to appear only once in a genome with equal base composition. We can check this computationally!
Let's generate genomes of size $10\times10^6$, which means that the minimum motif length for a sequence to be uniquely found in a genome of that size must be at least 12 bases (rounding up!). Let's make 10 genomes to sample from and pick motifs of a few different lengths to see how many times they come up in the genomes:
L = 10e6
1/2 * np.log2(L)
%%time
genomes = [generate_genome(size = 10e6, g_content = 25, c_content = 25, a_content=25) for g in range(10)]
motifs_16 = [generate_genome(size=16, g_content = 25, c_content = 25, a_content=25) for g in range(1000)]
motifs_12 = [generate_genome(size=12, g_content = 25, c_content = 25, a_content=25) for g in range(1000)]
motifs_11 = [generate_genome(size=11, g_content = 25, c_content = 25, a_content=25) for g in range(1000)]
motifs_9 = [generate_genome(size=9, g_content = 25, c_content = 25, a_content=25) for g in range(1000)]
counts_16 = [[genome.count(motif) for motif in np.random.choice(motifs_16, 100)] for genome in genomes]
print('100 motifs of length 16 appear, on average, ' + str(mean(counts_16)) + u" \u00B1 " + str(round(std(counts_16), 2)) + ' times per genome')
counts_12 = [[genome.count(motif) for motif in np.random.choice(motifs_12, 100)] for genome in genomes]
print('100 motifs of length 12 appear, on average, ' + str(mean(counts_12)) + u" \u00B1 " + str(round(std(counts_12), 2)) + ' times per genome')
counts_11 = [[genome.count(motif) for motif in np.random.choice(motifs_11, 100)] for genome in genomes]
print('100 motifs of length 11 appear, on average, ' + str(mean(counts_11)) + u" \u00B1 " + str(round(std(counts_11), 2)) + ' times per genome')
counts_9 = [[genome.count(motif) for motif in np.random.choice(motifs_9, 100)] for genome in genomes]
print('100 motifs of length 9 appear, on average, ' + str(mean(counts_9)) + u" \u00B1 " + str(round(std(counts_9), 2)) + ' times per genome')