# -*- coding: utf-8 -*-
# 3.0

#
# For the sqrt function
from math import sqrt

#
# In this class we shall be concentrating on what we have learnt to write some programs. This begins the math
# part of your course. To start with let us do some statistical computations. We shall input the data as lists
# of floats. We shall see later how to read such data from files.
#
# NOTE: every print below is written as print(<one string>). With a single parenthesised argument the
# output is the same whether print is a statement (Python 2) or a function (Python 3).
#

#
# The data
#
data_mid = [23, 45, 83, 90, 12, 87, 67, 69, 74, 36, 43, 69, 66, 70]
data_end = [45, 44, 95, 87, 24, 100, 45, 70, 66, 32, 50, 55, 80, 81]

#
# Mean
#
# First let us find the means to see if the class performance had any improvement after mid-sem. Let us write it
# as a function. Recall that the mean of a collection of numbers, $x_i$, $i = 1, \ldots, n$ is given by the formula
#
# \begin{equation}
# \text{mean} = \frac{\sum_{i = 1}^n x_i}{n}.
# \end{equation}
#


def find_mean(lst):
    """Given a list as input, this function computes the mean.

    lst is a list of numbers. The return value is a float.
    An empty list raises ZeroDivisionError, because we divide by len(lst).
    """
    # Store the sum in a variable. It is called total (and not sum) so that
    # the built-in function sum() is not hidden inside this function.
    total = 0.0  # Quiz : Why 0.0 and not just 0?

    # loop over all the numbers in the list
    for no in lst:
        total += no

    # The number of elements in the list
    no_of_entries = len(lst)

    # mean by definition is
    mean = total / no_of_entries

    # now, we return the value
    return mean


#
# Let us try this out
#
print("The mean for mid-sem is %6.2f, while that for the end-sem is %6.2f."
      % (find_mean(data_mid), find_mean(data_end)))

#
# Standard deviation
#
# Looks like there is a general increase. But an increase of the mean need not mean improvement across the
# board. It might have been that some substantial number of people actually did slightly worse, but a few
# people did exceptionally well in the end sem. One measure to check the spread is called standard deviation.
# The formula, for $x_i$ as above, is
#
# \begin{equation}
# \text{standard deviation} = \sqrt{\frac{1}{n} \sum_{i=1}^n (x_i - \mu)^2}
# \end{equation}
#
# where $\mu$ is the mean.
#
# Let us try to code this.
#


def find_sd(lst):
    """Given a list, this function computes the (biased) standard deviation.

    "Biased" means that we divide by n and not by n - 1.
    An empty list raises ZeroDivisionError (it comes from find_mean).
    """
    # This one depends on the function find_mean (). Let us find the mean.
    mu = find_mean(lst)

    # Rest of the code is similar to mean. Introduce a variable to store the sum of squares.
    sum_sq_dev = 0.0

    # Loop over the data to find this sum of squared deviations from the mean.
    for no in lst:
        sum_sq_dev += (no - mu) ** 2

    # To compute s.d. we also need to know the number of data points:
    n = len(lst)

    # Now to finish computing sd, we just need to divide by n and take square root.
    sd = sqrt(sum_sq_dev / n)

    # Don't ever forget to return your hard work.
    return sd


#
# Let us try it out.
#
print("The standard deviations for the two exams are %6.2f , %6.2f respectively."
      % (find_sd(data_mid), find_sd(data_end)))

#
# This code is very intuitive. However we are running through the data twice, once for computing mean and once
# for standard deviation. To save a bit of work, one can do a bit of simplification : Note that
# $n \mu = \sum_i x_i$. Therefore,
# \begin{equation}
# \begin{aligned}
# \sum_i (x_i - \mu)^2 &= \sum_i (x_i^2 - 2\mu x_i + \mu^2) \\
#   &= \sum_i x_i^2 - 2 \mu \sum_i x_i + n \mu^2 \\
#   &= \sum_i x_i^2 - 2n\mu^2 + n \mu^2 \\
#   &= \sum_i x_i^2 - n \mu^2 \\
#   &= \sum_i x_i^2 - \frac{1}{n}\left(\sum_i x_i\right)^2.
# \end{aligned}
# \end{equation}
#
# To make use of this we use one loop to compute both the sum of the numbers and the sum of their squares.
# Then we use these computations to compute the sd.
#


def find_sd2(lst):
    """Given a list, this function computes the (biased) standard deviation more efficiently.

    The data is traversed only once. An empty list raises ZeroDivisionError.
    """
    # We need a variable to store the sum, and another one to store the sum of squares.
    total = 0.0
    sum_sq = 0.0

    # Loop over the data to find the sum and the sum of squares.
    for no in lst:
        total += no
        sum_sq += no ** 2

    # To compute s.d., and the sum of squares of deviations, we also need to know the number
    # of data points:
    n = len(lst)

    # Using this compute the sum of squares of deviations
    sum_sq_dev = sum_sq - total ** 2 / n

    # In exact arithmetic this quantity is never negative, but floating point rounding can
    # leave a tiny negative number when all the values are (almost) equal, and sqrt() would
    # then fail. So we do not let it go below zero.
    sum_sq_dev = max(sum_sq_dev, 0.0)

    # Now to finish computing sd, we just need to divide by n and take square root.
    sd = sqrt(sum_sq_dev / n)

    # Don't ever forget to return your hard work.
    return sd


#
# Let us try to use it :
#
print("The standard deviations (using the second function) for the two exams are"
      " %6.2f , %6.2f respectively." % (find_sd2(data_mid), find_sd2(data_end)))

#
# Correlation
#
# Things seem to be better. But did the people who scored high in the first exam score high in the second too?
# To see that there is a measure called correlation. The formula is used on two sets of data and the formula
# spits out a value between -1 and 1. The formula is
#
# \begin{equation}
# \text{Correlation} = \frac{\text{Covariance}}{(\text{s.d. of } X)(\text{s.d. of } Y)}
# \end{equation}
#
# where
#
# \begin{equation}
# \text{Covariance} = \frac{1}{n} \sum_{i=1}^n (x_i - m_x)(y_i - m_y)
# \end{equation}
#
# $x_i$, $y_i$ being the given data of size $n$, $m_x$ and $m_y$ being the means of x and y respectively, and
# $n$ being the number of data pairs.
#
# As before we try to simplify the formula so that we can compute everything using just one loop.
#
# \begin{equation}
# \begin{aligned}
# \sum_i (x_i - m_x)(y_i - m_y) &= \sum_i x_i y_i - m_x \sum_i y_i - m_y \sum_i x_i + n m_x m_y \\
#   &= \sum_i x_i y_i - n m_x m_y - n m_x m_y + n m_x m_y \\
#   &= \sum_i x_i y_i - \frac{1}{n} \left(\sum_i x_i\right)\left(\sum_i y_i\right).
# \end{aligned}
# \end{equation}
#
# It makes sense to write the correlation function for a list of pairs. We can use that on our data using zip.
#


def my_corr(lst_of_2_tuples):
    """Given a list of 2-tuples, this function computes the correlation between the first
    entries and the second entries.

    Any iterable of pairs is accepted (for example the result of zip).
    ZeroDivisionError is raised if there are no pairs, or if the spread of the first
    or of the second entries comes out as zero.
    """
    # We need len() below, and zip() does not always give a list, so make a list first.
    pairs = list(lst_of_2_tuples)

    # As before we use a huge bunch of variables.
    sumx = 0.0
    sumy = 0.0
    sumxy = 0.0
    sumx2 = 0.0
    sumy2 = 0.0

    # Now loop
    for pair in pairs:
        # To make reading easier, set
        x = pair[0]
        y = pair[1]

        # Now accumulate
        sumx += x
        sumy += y
        sumxy += x * y
        sumx2 += x * x
        sumy2 += y * y

    # Now we got all the ingredients to compute covariance and s.d. except n :
    n = len(pairs)

    # Now compute. We leave out the division by n in the covariance and in both the
    # variances: covariance has 1/n, and each s.d. has 1/sqrt(n), so these factors cancel
    # in the final ratio.
    n_covariance = sumxy - sumx * sumy / n

    # Rounding can make these differences very slightly negative, so do not let them go
    # below zero before taking the square root.
    root_n_sdx = sqrt(max(sumx2 - sumx ** 2 / n, 0.0))
    root_n_sdy = sqrt(max(sumy2 - sumy ** 2 / n, 0.0))

    correlation = n_covariance / (root_n_sdx * root_n_sdy)

    return correlation


#
# Let us try this
#
print("Correlation is %6.4f." % (my_corr(zip(data_mid, data_end))))

#
# Some general remarks
#
# Soon we shall learn how to read data from a file (and if time permits, from a webpage.)
#
# Try this website : http://people.csail.mit.edu/pgbovine/python/tutor.html .
#

#
# List comprehension
#
# (Ref Pg. 63) Syntax : new_list = [f(e) for e in some_other_list]
#
list_of_first_100_odds = [(2*n + 1) for n in range(100)]
print(list_of_first_100_odds)

#
random_list1 = [3, 6, 1]
random_list2 = [40, 70]
sum_list = [[(i + j) for i in random_list1] for j in random_list2]
print(sum_list)

#
# We can traverse a list also as follows
#
# list() is needed because we want to print the pairs and also loop over them afterwards.
marks = list(zip(data_mid, data_end))
# Two spaces after the colon: one from the text "Marks : " itself and one separator before the list.
print("Marks :  %s" % (marks,))
print("-"*70)
print("Mid\tEnd")
for m, e in marks:
    print("%5.1f\t%5.1f" % (m, e))