import pandas as pd
from pandas import Series,DataFrame
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from __future__ import division

import requests

from StringIO import StringIO

# CSV export of the 2012 general-election Romney-vs-Obama polls.
url = 'http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv'

# Download the CSV text. The timeout keeps the notebook from hanging forever on
# a dead connection, and raise_for_status() stops here on an HTTP error instead
# of letting an error page be handed to the CSV parser below.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

# Wrap the downloaded text in a file-like object so pandas can read it.
poll_data = StringIO(source)

poll_df = pd.read_csv(poll_data)

# Quick overview of columns, dtypes and missing values.
poll_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Data columns (total 17 columns):
Pollster                  586 non-null object
Start Date                586 non-null object
End Date                  586 non-null object
Entry Date/Time (ET)      586 non-null object
Number of Observations    564 non-null float64
Population                586 non-null object
Mode                      586 non-null object
Obama                     586 non-null float64
Romney                    586 non-null float64
Undecided                 423 non-null float64
Other                     202 non-null float64
Pollster URL              586 non-null object
Source URL                584 non-null object
Partisan                  586 non-null object
Affiliation               586 non-null object
Question Text             0 non-null float64
Question Iteration        586 non-null int64
dtypes: float64(6), int64(1), object(10)
memory usage: 77.9+ KB

# Preview the first rows of the poll table.
poll_df.head()

# Number of polls for each value of the 'Affiliation' column.
sns.factorplot(x='Affiliation', data=poll_df, kind='count')

<seaborn.axisgrid.FacetGrid at 0x109a156d0>

# Same count plot, with bars split by the 'Population' column.
sns.factorplot(x='Affiliation', hue='Population', data=poll_df, kind='count')

<seaborn.axisgrid.FacetGrid at 0x109a155d0>

poll_df.head()

# Column-wise mean of the numeric columns. numeric_only=True states explicitly
# that text columns (pollster name, dates, URLs, ...) are excluded; without it
# newer pandas releases raise a TypeError instead of silently skipping them.
avg = pd.DataFrame(poll_df.mean(numeric_only=True))

# The sample size is not a polling result, so keep it out of the summary.
avg.drop('Number of Observations',axis=0,inplace=True)

avg.head()

# Column-wise standard deviation, restricted to numeric columns the same way.
std = pd.DataFrame(poll_df.std(numeric_only=True))

std.drop('Number of Observations',axis=0,inplace=True)

std.head()

# Bar chart of the averages with the standard deviations as error bars.
avg.plot(yerr=std,kind='bar',legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x10a4b93d0>

# Place the mean and standard-deviation frames side by side and label them.
poll_avg = pd.concat([avg, std], axis=1)
poll_avg.columns = ['Average', 'STD']

poll_avg

poll_df.head()

# Markers only (no connecting line): one point per poll for each series,
# plotted against the poll's end date.
poll_df.plot(
    x='End Date',
    y=['Obama', 'Romney', 'Undecided'],
    marker='o',
    linestyle='',
)

<matplotlib.axes._subplots.AxesSubplot at 0x10a5cddd0>

from datetime import datetime

# Obama's margin over Romney expressed as a fraction; positive values mean
# Obama is ahead, negative values mean Romney is ahead.
poll_df['Difference']=(poll_df.Obama-poll_df.Romney)/100

poll_df.head()

# Average all polls sharing a start date. numeric_only=True makes the dropping
# of text columns explicit; newer pandas releases raise a TypeError when asked
# to average string columns. as_index=False keeps 'Start Date' as a regular
# column, so the result has a default 0..n-1 index.
poll_df=poll_df.groupby(['Start Date'],as_index=False).mean(numeric_only=True)

poll_df.head()

# Margin over time, one point per start date.
poll_df.plot('Start Date','Difference',figsize=(12,4),marker='o',linestyle='-',color='purple')

<matplotlib.axes._subplots.AxesSubplot at 0x10a5b0e50>

# Collect the row positions of every start date that falls in October 2012.
# enumerate() supplies the position, replacing a hand-maintained counter that
# had to be incremented in both branches of the if/else.
xlimit=[]

for row_in, date in enumerate(poll_df['Start Date']):
    # Dates are 'YYYY-MM-DD' strings, so the first seven characters are the month.
    if date[0:7]=='2012-10':
        xlimit.append(row_in)

# First and last October row positions. The single-argument parenthesized
# form prints the same text under Python 2 and Python 3.
print(min(xlimit))
print(max(xlimit))

325
352

# Bounds of the October window, taken from the row positions collected in
# xlimit rather than hard-coded, so the zoom and the markers below always
# line up with the data actually loaded.
oct_start = min(xlimit)
oct_end = max(xlimit)

poll_df.plot('Start Date','Difference',figsize=(12,4),marker='o',linestyle='-',color='purple',xlim=(oct_start,oct_end))

# Vertical markers placed relative to the first October row.
# NOTE(review): the offsets assume roughly one row per calendar day from
# Oct 1st onward - confirm against the 'Start Date' values.

# Oct 3rd
plt.axvline(x=oct_start+2,linewidth=4,color='grey')

# Oct 11th
plt.axvline(x=oct_start+10,linewidth=4,color='grey')

# Oct 22nd
plt.axvline(x=oct_start+21,linewidth=4,color='grey')

<matplotlib.lines.Line2D at 0x10af3f990>

pwd

u'/Users/takuoyoneda/Documents/git/learning_python_for_data_analysis_and_visualization/learning_python_for_data_analysis_and_visualization/election_analysis'

# Load the donor records from the CSV file in the current working directory.
donor_df = pd.read_csv(filepath_or_buffer='Election_Donor_Data.csv')

# Column names, dtypes and non-null counts.
donor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 16 columns):
cmte_id              1001731 non-null object
cand_id              1001731 non-null object
cand_nm              1001731 non-null object
contbr_nm            1001731 non-null object
contbr_city          1001712 non-null object
contbr_st            1001727 non-null object
contbr_zip           1001620 non-null object
contbr_employer      988002 non-null object
contbr_occupation    993301 non-null object
contb_receipt_amt    1001731 non-null float64
contb_receipt_dt     1001731 non-null object
receipt_desc         14166 non-null object
memo_cd              92482 non-null object
memo_text            97770 non-null object
form_tp              1001731 non-null object
file_num             1001731 non-null int64
dtypes: float64(1), int64(1), object(14)
memory usage: 122.3+ MB

donor_df.head()

# How often each distinct contribution amount occurs, most frequent first.
donor_df.contb_receipt_amt.value_counts()

 100.00     178188
 50.00      137584
 25.00      110345
 250.00      91182
 500.00      57984
 2500.00     49005
 35.00       37237
 1000.00     36494
 10.00       33986
 200.00      27813
 20.00       17565
 15.00       16163
 150.00      14600
 75.00       13647
 201.20      11718
 30.00       11381
 300.00      11204
 20.12        9897
 5.00         9024
 40.00        5007
 2000.00      4128
 55.00        3760
 1500.00      3705
 3.00         3383
 60.00        3084
 400.00       3066
-2500.00      2727
 110.00       2554
 125.00       2520
 19.00        2474
             ...  
 174.80          1
 7.27            1
 1219.00         1
 1884.88         1
 162.25          1
 218.31          1
 78.62           1
 203.16          1
 53.11           1
 499.66          1
 19.53           1
 188.60          1
 47.10           1
 19.85           1
 28.83           1
 202.59          1
-5500.00         1
 9.25            1
 202.66          1
 1205.00         1
 80.73           1
 115.07          1
 213.69          1
 70.76           1
 144.13          1
 97.15           1
 122.32          1
 188.65          1
 122.40          1
 132.12          1
Name: contb_receipt_amt, Length: 8079, dtype: int64

# Mean and standard deviation over every contribution amount in the table.
don_mean = donor_df.contb_receipt_amt.mean()

don_std = donor_df.contb_receipt_amt.std()

# Report both figures rounded to two decimals.
print('The average donation was %.2f with a std %.2f' % (don_mean, don_std))

The average donation was 298.24 with a std 3749.67

# Independent copy of the contribution amounts, so later filtering does not
# touch donor_df.
top_donor = donor_df['contb_receipt_amt'].copy()

# The amounts are deliberately left in their original row order.
# Series.sort_index() returns a new object rather than sorting in place, so a
# bare call whose result is thrown away does nothing and has been removed.
top_donor

0           250.0
1            50.0
2           250.0
3           250.0
4           300.0
5           500.0
6           250.0
7           250.0
8           250.0
9           250.0
10          250.0
11          500.0
12          250.0
13          250.0
14          250.0
15          300.0
16          500.0
17         1000.0
18          250.0
19          300.0
20          500.0
21          250.0
22         2500.0
23         2500.0
24          150.0
25          200.0
26          100.0
27          250.0
28          500.0
29          250.0
            ...  
1001701    2500.0
1001702    2500.0
1001703   -2500.0
1001704   -2500.0
1001705    1000.0
1001706    2500.0
1001707   -2500.0
1001708    2500.0
1001709   -2500.0
1001710   -2500.0
1001711    1000.0
1001712    2500.0
1001713    2500.0
1001714     250.0
1001715     250.0
1001716    1000.0
1001717     100.0
1001718    2500.0
1001719    2500.0
1001720     100.0
1001721     250.0
1001722     100.0
1001723     100.0
1001724     500.0
1001725    2500.0
1001726    5000.0
1001727    2500.0
1001728     500.0
1001729     500.0
1001730    2500.0
Name: contb_receipt_amt, Length: 1001731, dtype: float64

# Keep strictly positive amounts only (drops zero and negative entries).
top_donor = top_donor[top_donor.gt(0)]

# Show the remaining amounts ordered by their row label.
top_donor.sort_index()

0           250.0
1            50.0
2           250.0
3           250.0
4           300.0
5           500.0
6           250.0
7           250.0
8           250.0
9           250.0
10          250.0
11          500.0
12          250.0
13          250.0
14          250.0
15          300.0
16          500.0
17         1000.0
18          250.0
19          300.0
20          500.0
21          250.0
22         2500.0
23         2500.0
24          150.0
25          200.0
26          100.0
27          250.0
28          500.0
29          250.0
            ...  
1001696    1000.0
1001697    1500.0
1001698    2500.0
1001699    2500.0
1001700     300.0
1001701    2500.0
1001702    2500.0
1001705    1000.0
1001706    2500.0
1001708    2500.0
1001711    1000.0
1001712    2500.0
1001713    2500.0
1001714     250.0
1001715     250.0
1001716    1000.0
1001717     100.0
1001718    2500.0
1001719    2500.0
1001720     100.0
1001721     250.0
1001722     100.0
1001723     100.0
1001724     500.0
1001725    2500.0
1001726    5000.0
1001727    2500.0
1001728     500.0
1001729     500.0
1001730    2500.0
Name: contb_receipt_amt, Length: 991475, dtype: float64

# The ten most frequent contribution amounts.
top_donor.value_counts().iloc[:10]

100.0     178188
50.0      137584
25.0      110345
250.0      91182
500.0      57984
2500.0     49005
35.0       37237
1000.0     36494
10.0       33986
200.0      27813
Name: contb_receipt_amt, dtype: int64

# Restrict to amounts below 2500 and look at their distribution.
com_don = top_donor[top_donor.lt(2500)]

com_don.hist(bins=100)

<matplotlib.axes._subplots.AxesSubplot at 0x136ae8050>

# Distinct candidate names appearing in the donor table.
candidates = donor_df['cand_nm'].unique()

candidates

array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick', 'Cain, Herman',
       'Gingrich, Newt', 'McCotter, Thaddeus G', 'Huntsman, Jon',
       'Perry, Rick'], dtype=object)

# Candidate name -> party label. Every listed candidate is mapped to
# 'Republican' except Obama, who is mapped to 'Democrat'.
party_map = dict.fromkeys(
    [
        'Bachmann, Michelle',
        'Cain, Herman',
        'Gingrich, Newt',
        'Huntsman, Jon',
        'Johnson, Gary Earl',
        'McCotter, Thaddeus G',
        'Paul, Ron',
        'Pawlenty, Timothy',
        'Perry, Rick',
        "Roemer, Charles E. 'Buddy' III",
        'Romney, Mitt',
        'Santorum, Rick',
    ],
    'Republican',
)
party_map['Obama, Barack'] = 'Democrat'

# Attach a party label to every row by looking up the candidate's name.
donor_df['Party'] = donor_df['cand_nm'].map(party_map)

# Keep strictly positive contribution amounts only.
donor_df = donor_df[donor_df['contb_receipt_amt'] > 0]

donor_df.head()

# Number of remaining contributions per candidate.
donor_df.groupby(by='cand_nm')['contb_receipt_amt'].count()

cand_nm
Bachmann, Michelle                 13082
Cain, Herman                       20052
Gingrich, Newt                     46883
Huntsman, Jon                       4066
Johnson, Gary Earl                  1234
McCotter, Thaddeus G                  73
Obama, Barack                     589127
Paul, Ron                         143161
Pawlenty, Timothy                   3844
Perry, Rick                        12709
Roemer, Charles E. 'Buddy' III      5844
Romney, Mitt                      105155
Santorum, Rick                     46245
Name: contb_receipt_amt, dtype: int64

# Total contribution amount per candidate.
donor_df.groupby(by='cand_nm')['contb_receipt_amt'].sum()

cand_nm
Bachmann, Michelle                2.711439e+06
Cain, Herman                      7.101082e+06
Gingrich, Newt                    1.283277e+07
Huntsman, Jon                     3.330373e+06
Johnson, Gary Earl                5.669616e+05
McCotter, Thaddeus G              3.903000e+04
Obama, Barack                     1.358774e+08
Paul, Ron                         2.100962e+07
Pawlenty, Timothy                 6.004819e+06
Perry, Rick                       2.030575e+07
Roemer, Charles E. 'Buddy' III    3.730099e+05
Romney, Mitt                      8.833591e+07
Santorum, Rick                    1.104316e+07
Name: contb_receipt_amt, dtype: float64

# Total contribution amount per candidate, indexed by candidate name.
cand_amount=donor_df.groupby('cand_nm')['contb_receipt_amt'].sum()

# Walk the names and totals together instead of tracking a separate index
# counter by hand. The single-argument parenthesized print emits the same
# text under Python 2 and Python 3.
for cand_name, don in zip(cand_amount.index, cand_amount):
    print(' The candidate %s raise %.0f dolloars' % (cand_name, don))
    print('')

 The candidate Bachmann, Michelle raise 2711439 dolloars

 The candidate Cain, Herman raise 7101082 dolloars

 The candidate Gingrich, Newt raise 12832770 dolloars

 The candidate Huntsman, Jon raise 3330373 dolloars

 The candidate Johnson, Gary Earl raise 566962 dolloars

 The candidate McCotter, Thaddeus G raise 39030 dolloars

 The candidate Obama, Barack raise 135877427 dolloars

 The candidate Paul, Ron raise 21009620 dolloars

 The candidate Pawlenty, Timothy raise 6004819 dolloars

 The candidate Perry, Rick raise 20305754 dolloars

 The candidate Roemer, Charles E. 'Buddy' III raise 373010 dolloars

 The candidate Romney, Mitt raise 88335908 dolloars

 The candidate Santorum, Rick raise 11043159 dolloars

# Bar chart of the per-candidate totals.
cand_amount.plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x10e396410>

# Bar chart of the total contribution amount per party label.
donor_df.groupby(by='Party')['contb_receipt_amt'].sum().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x10e504610>

# Sum of contribution amounts for each occupation (rows) and party (columns).
occupation_df = donor_df.pivot_table(
    values='contb_receipt_amt',
    index='contbr_occupation',
    columns='Party',
    aggfunc='sum',
)

occupation_df.head()

occupation_df.tail()

# (number of occupations, number of parties)
occupation_df.shape

(45067, 2)

# Keep only occupations whose total across both parties exceeds one million.
occupation_df = occupation_df[occupation_df.sum(axis=1) > 1000000]

occupation_df.shape

(31, 2)

# Vertical bars: one group per occupation, one bar per party.
occupation_df.plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x10ec25810>

# Horizontal bars on a taller canvas so the occupation labels stay readable.
occupation_df.plot.barh(figsize=(10, 12), cmap='seismic')

<matplotlib.axes._subplots.AxesSubplot at 0x10f4948d0>

# Remove the two placeholder rows that do not name an occupation.
occupation_df.drop(
    ['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED'],
    axis=0,
    inplace=True,
)

# Fold the 'C.E.O.' spelling into the 'CEO' row, then drop the duplicate row.
occupation_df.loc['CEO'] = occupation_df.loc['CEO'].add(occupation_df.loc['C.E.O.'])

occupation_df.drop('C.E.O.', inplace=True)

# Redraw the horizontal bar chart with the cleaned-up rows.
occupation_df.plot.barh(figsize=(10, 12), cmap='seismic')

<matplotlib.axes._subplots.AxesSubplot at 0x10f57a690>

	Pollster	Start Date	End Date	Entry Date/Time (ET)	Number of Observations	Population	Mode	Obama	Romney	Undecided	Other	Pollster URL	Source URL	Partisan	Affiliation	Question Text	Question Iteration
0	Politico/GWU/Battleground	2012-11-04	2012-11-05	2012-11-06T08:40:26Z	1000.0	Likely Voters	Live Phone	47.0	47.0	6.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.politico.com/news/stories/1112/8338...	Nonpartisan	None	NaN	1
1	YouGov/Economist	2012-11-03	2012-11-05	2012-11-26T15:31:23Z	740.0	Likely Voters	Internet	49.0	47.0	3.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://cdn.yougov.com/cumulus_uploads/document...	Nonpartisan	None	NaN	1
2	Gravis Marketing	2012-11-03	2012-11-05	2012-11-06T09:22:02Z	872.0	Likely Voters	Automated Phone	48.0	48.0	4.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.gravispolls.com/2012/11/gravis-mark...	Nonpartisan	None	NaN	1
3	IBD/TIPP	2012-11-03	2012-11-05	2012-11-06T08:51:48Z	712.0	Likely Voters	Live Phone	50.0	49.0	NaN	1.0	http://elections.huffingtonpost.com/pollster/p...	http://news.investors.com/special-report/50841...	Nonpartisan	None	NaN	1
4	Rasmussen	2012-11-03	2012-11-05	2012-11-06T08:47:50Z	1500.0	Likely Voters	Automated Phone	48.0	49.0	NaN	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.rasmussenreports.com/public_content...	Nonpartisan	None	NaN	1

	Pollster	Start Date	End Date	Entry Date/Time (ET)	Number of Observations	Population	Mode	Obama	Romney	Undecided	Other	Pollster URL	Source URL	Partisan	Affiliation	Question Text	Question Iteration
0	Politico/GWU/Battleground	2012-11-04	2012-11-05	2012-11-06T08:40:26Z	1000.0	Likely Voters	Live Phone	47.0	47.0	6.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.politico.com/news/stories/1112/8338...	Nonpartisan	None	NaN	1
1	YouGov/Economist	2012-11-03	2012-11-05	2012-11-26T15:31:23Z	740.0	Likely Voters	Internet	49.0	47.0	3.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://cdn.yougov.com/cumulus_uploads/document...	Nonpartisan	None	NaN	1
2	Gravis Marketing	2012-11-03	2012-11-05	2012-11-06T09:22:02Z	872.0	Likely Voters	Automated Phone	48.0	48.0	4.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.gravispolls.com/2012/11/gravis-mark...	Nonpartisan	None	NaN	1
3	IBD/TIPP	2012-11-03	2012-11-05	2012-11-06T08:51:48Z	712.0	Likely Voters	Live Phone	50.0	49.0	NaN	1.0	http://elections.huffingtonpost.com/pollster/p...	http://news.investors.com/special-report/50841...	Nonpartisan	None	NaN	1
4	Rasmussen	2012-11-03	2012-11-05	2012-11-06T08:47:50Z	1500.0	Likely Voters	Automated Phone	48.0	49.0	NaN	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.rasmussenreports.com/public_content...	Nonpartisan	None	NaN	1

	Average	STD
Obama	46.805461	2.422058
Romney	44.614334	2.906180
Undecided	6.550827	3.701754
Other	3.376238	2.692726
Question Text	NaN	NaN
Question Iteration	1.000000	0.000000

	Pollster	Start Date	End Date	Entry Date/Time (ET)	Number of Observations	Population	Mode	Obama	Romney	Undecided	Other	Pollster URL	Source URL	Partisan	Affiliation	Question Text	Question Iteration
0	Politico/GWU/Battleground	2012-11-04	2012-11-05	2012-11-06T08:40:26Z	1000.0	Likely Voters	Live Phone	47.0	47.0	6.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.politico.com/news/stories/1112/8338...	Nonpartisan	None	NaN	1
1	YouGov/Economist	2012-11-03	2012-11-05	2012-11-26T15:31:23Z	740.0	Likely Voters	Internet	49.0	47.0	3.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://cdn.yougov.com/cumulus_uploads/document...	Nonpartisan	None	NaN	1
2	Gravis Marketing	2012-11-03	2012-11-05	2012-11-06T09:22:02Z	872.0	Likely Voters	Automated Phone	48.0	48.0	4.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.gravispolls.com/2012/11/gravis-mark...	Nonpartisan	None	NaN	1
3	IBD/TIPP	2012-11-03	2012-11-05	2012-11-06T08:51:48Z	712.0	Likely Voters	Live Phone	50.0	49.0	NaN	1.0	http://elections.huffingtonpost.com/pollster/p...	http://news.investors.com/special-report/50841...	Nonpartisan	None	NaN	1
4	Rasmussen	2012-11-03	2012-11-05	2012-11-06T08:47:50Z	1500.0	Likely Voters	Automated Phone	48.0	49.0	NaN	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.rasmussenreports.com/public_content...	Nonpartisan	None	NaN	1

	Pollster	Start Date	End Date	Entry Date/Time (ET)	Number of Observations	Population	Mode	Obama	Romney	Undecided	Other	Pollster URL	Source URL	Partisan	Affiliation	Question Text	Question Iteration	Difference
0	Politico/GWU/Battleground	2012-11-04	2012-11-05	2012-11-06T08:40:26Z	1000.0	Likely Voters	Live Phone	47.0	47.0	6.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.politico.com/news/stories/1112/8338...	Nonpartisan	None	NaN	1	0.00
1	YouGov/Economist	2012-11-03	2012-11-05	2012-11-26T15:31:23Z	740.0	Likely Voters	Internet	49.0	47.0	3.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://cdn.yougov.com/cumulus_uploads/document...	Nonpartisan	None	NaN	1	0.02
2	Gravis Marketing	2012-11-03	2012-11-05	2012-11-06T09:22:02Z	872.0	Likely Voters	Automated Phone	48.0	48.0	4.0	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.gravispolls.com/2012/11/gravis-mark...	Nonpartisan	None	NaN	1	0.00
3	IBD/TIPP	2012-11-03	2012-11-05	2012-11-06T08:51:48Z	712.0	Likely Voters	Live Phone	50.0	49.0	NaN	1.0	http://elections.huffingtonpost.com/pollster/p...	http://news.investors.com/special-report/50841...	Nonpartisan	None	NaN	1	0.01
4	Rasmussen	2012-11-03	2012-11-05	2012-11-06T08:47:50Z	1500.0	Likely Voters	Automated Phone	48.0	49.0	NaN	NaN	http://elections.huffingtonpost.com/pollster/p...	http://www.rasmussenreports.com/public_content...	Nonpartisan	None	NaN	1	-0.01

	Start Date	Number of Observations	Obama	Romney	Undecided	Other	Question Text	Question Iteration	Difference
0	2009-03-13	1403.0	44.0	44.0	12.0	NaN	NaN	1	0.00
1	2009-04-17	686.0	50.0	39.0	11.0	NaN	NaN	1	0.11
2	2009-05-14	1000.0	53.0	35.0	12.0	NaN	NaN	1	0.18
3	2009-06-12	638.0	48.0	40.0	12.0	NaN	NaN	1	0.08
4	2009-07-15	577.0	49.0	40.0	11.0	NaN	NaN	1	0.09

	cmte_id	cand_id	cand_nm	contbr_nm	contbr_city	contbr_st	contbr_zip	contbr_employer	contbr_occupation	contb_receipt_amt	contb_receipt_dt	receipt_desc	memo_cd	memo_text	form_tp	file_num
0	C00410118	P20002978	Bachmann, Michelle	HARVEY, WILLIAM	MOBILE	AL	3.6601e+08	RETIRED	RETIRED	250.0	20-JUN-11	NaN	NaN	NaN	SA17A	736166
1	C00410118	P20002978	Bachmann, Michelle	HARVEY, WILLIAM	MOBILE	AL	3.6601e+08	RETIRED	RETIRED	50.0	23-JUN-11	NaN	NaN	NaN	SA17A	736166
2	C00410118	P20002978	Bachmann, Michelle	SMITH, LANIER	LANETT	AL	3.68633e+08	INFORMATION REQUESTED	INFORMATION REQUESTED	250.0	05-JUL-11	NaN	NaN	NaN	SA17A	749073
3	C00410118	P20002978	Bachmann, Michelle	BLEVINS, DARONDA	PIGGOTT	AR	7.24548e+08	NONE	RETIRED	250.0	01-AUG-11	NaN	NaN	NaN	SA17A	749073
4	C00410118	P20002978	Bachmann, Michelle	WARDENBURG, HAROLD	HOT SPRINGS NATION	AR	7.19016e+08	NONE	RETIRED	300.0	20-JUN-11	NaN	NaN	NaN	SA17A	736166

Party	Democrat	Republican
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER	100.0	NaN
AREA VICE PRESIDENT	250.0	NaN
RESEARCH ASSOCIATE	100.0	NaN
TEACHER	500.0	NaN
THERAPIST	3900.0	NaN

Party	Democrat	Republican
contbr_occupation
ZOOKEEPER	35.0	NaN
ZOOLOGIST	400.0	NaN
ZOOLOGY EDUCATION	25.0	NaN
\NONE\	NaN	250.0
~	NaN	75.0