!pip install opencv-python
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.colors as mcolors
from sklearn.cluster import KMeans
import json # library to handle JSON files
import requests # library to handle requests
import folium # map rendering library
import cv2, os
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from scipy import stats
sns.set()
import math
from folium.features import DivIcon
mpl.use('Agg')
mpl.style.use('fivethirtyeight')
from sklearn.datasets import fetch_lfw_people, fetch_olivetti_faces
%matplotlib inline
from ipywidgets import interact
10.8. JNB Lab Solutions#
10.8.1. Section 1 Exercise#
raw_CPS_data= pd.read_json('https://data.cityofchicago.org/resource/kh4r-387c.json?$limit=100000')
raw_CPS_data.head(1)
|  | school_id | legacy_unit_id | finance_id | short_name | long_name | primary_category | is_high_school | is_middle_school | is_elementary_school | is_pre_school | ... | fifth_contact_title | fifth_contact_name | seventh_contact_title | seventh_contact_name | refugee_services | visual_impairments | freshman_start_end_time | sixth_contact_title | sixth_contact_name | hard_of_hearing |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 609966 | 3750 | 23531 | HAMMOND | Charles G Hammond Elementary School | ES | False | True | True | True | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |

1 rows × 92 columns
raw_CPS_data.columns
Index(['school_id', 'legacy_unit_id', 'finance_id', 'short_name', 'long_name',
'primary_category', 'is_high_school', 'is_middle_school',
'is_elementary_school', 'is_pre_school', 'summary',
'administrator_title', 'administrator', 'secondary_contact_title',
'secondary_contact', 'address', 'city', 'state', 'zip', 'phone', 'fax',
'cps_school_profile', 'website', 'facebook', 'attendance_boundaries',
'grades_offered_all', 'grades_offered', 'student_count_total',
'student_count_low_income', 'student_count_special_ed',
'student_count_english_learners', 'student_count_black',
'student_count_hispanic', 'student_count_white', 'student_count_asian',
'student_count_native_american', 'student_count_other_ethnicity',
'student_count_asian_pacific', 'student_count_multi',
'student_count_hawaiian_pacific', 'student_count_ethnicity_not',
'statistics_description', 'demographic_description', 'dress_code',
'prek_school_day', 'kindergarten_school_day', 'school_hours',
'after_school_hours', 'earliest_drop_off_time', 'classroom_languages',
'bilingual_services', 'title_1_eligible', 'preschool_inclusive',
'preschool_instructional', 'transportation_bus', 'transportation_el',
'school_latitude', 'school_longitude', 'overall_rating',
'rating_status', 'rating_statement', 'classification_description',
'school_year', 'third_contact_title', 'third_contact_name', 'network',
'is_gocps_participant', 'is_gocps_prek', 'is_gocps_elementary',
'is_gocps_high_school', 'open_for_enrollment_date', 'twitter',
'youtube', 'pinterest', 'college_enrollment_rate_school',
'college_enrollment_rate_mean', 'graduation_rate_school',
'graduation_rate_mean', 'significantly_modified',
'transportation_metra', 'fourth_contact_title', 'fourth_contact_name',
'fifth_contact_title', 'fifth_contact_name', 'seventh_contact_title',
'seventh_contact_name', 'refugee_services', 'visual_impairments',
'freshman_start_end_time', 'sixth_contact_title', 'sixth_contact_name',
'hard_of_hearing'],
dtype='object')
raw_CPS_data['grades_offered'].value_counts()
PK,K-8 327
9-12 144
K-8 82
7-12 11
PK,K-6 10
PK,K-5 10
6-12 9
K-6 8
6-8 6
PK,K-4 4
K-12 4
PE,PK,K-8 4
11-12 4
5-8 3
PK 3
K-5 3
PK,K-3 3
8-12 2
PK,K-2 2
7-8 2
PK,3-8 1
9 1
K,4-8 1
K-1,5-8 1
3-12 1
K-3,5-8 1
1-8 1
PK,K-7 1
10-12 1
4-11 1
K-3 1
K-2 1
4-8 1
Name: grades_offered, dtype: int64
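If one also wanted to restrict attention to elementary (PK,K-8 or K-8) schools rather than filtering on zip code alone, the grades_offered values tabulated above could be used as an extra filter. A small sketch, assuming raw_CPS_data as loaded above:

# Hypothetical extra filter: keep only schools whose grade span is PK,K-8 or K-8.
elem = raw_CPS_data[raw_CPS_data['grades_offered'].isin(['PK,K-8', 'K-8'])]
print(len(elem), "schools offer a PK,K-8 or K-8 grade span")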
df=raw_CPS_data[['address','student_count_total','student_count_black','student_count_hispanic','student_count_white','zip']]
df23=df[df['zip']==60623]
df23=df23.reset_index(drop=True)
df23.columns= ["address","total","black","hispanic","white","zip"]
df23.head(1)
|  | address | total | black | hispanic | white | zip |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 2819 W 21ST PL | 342 | 33 | 304 | 2 | 60623 |
for i in df23.index:
    df23.loc[i,'%black']=round(100*df23.loc[i,'black']/df23.loc[i,'total'],1)
    df23.loc[i,'%hispanic']=round(100*df23.loc[i,'hispanic']/df23.loc[i,'total'],1)
    df23.loc[i,'%white']=round(100*df23.loc[i,'white']/df23.loc[i,'total'],1)
df23.head(1)
|  | address | total | black | hispanic | white | zip | %black | %hispanic | %white |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 2819 W 21ST PL | 342 | 33 | 304 | 2 | 60623 | 9.6 | 88.9 | 0.6 |
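The loop above fills in the percentage columns row by row; the same columns can be computed in one step with vectorized pandas arithmetic. A minimal equivalent sketch, assuming df23 as defined above:

# Vectorized alternative to the row-by-row percentage loop.
for group in ['black', 'hispanic', 'white']:
    df23['%' + group] = (100 * df23[group] / df23['total']).round(1)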
from sklearn.linear_model import LinearRegression #sklearn is a machine learning library
X=df23[["%black"]]
Y=df23[["%hispanic"]]
reg=LinearRegression()
reg.fit(X,Y)
print("Intercept is ", reg.intercept_)
print("Slope is ", reg.coef_)
print("R^2 for OLS is ", reg.score(X,Y))
# x values on the regression line will be between 0 and 100 with a spacing of .01
x = np.arange(0, 100 ,.01)
# define the regression line y = mx+b here
[[m]]=reg.coef_
[b]=reg.intercept_
y = m*x + b
fig=df23.plot(x='%black', y='%hispanic', style='o')
plt.title('% Black vs % Hispanic in 60623 pre-K - 8 Schools')
plt.xlabel('% Black')
plt.ylabel('% Hispanic')
# plot the regression line
plt.plot(x,y, 'r') # 'r' draws the regression line in red
plt.legend([],[], frameon=True)
plt.grid()
plt.show()
Intercept is [98.71284906]
Slope is [[-0.99515441]]
R^2 for OLS is 0.9996943205528053
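As a quick cross-check on the fitted line, scipy.stats.linregress (scipy's stats module is imported at the top of the notebook) should report the same slope and intercept, and the square of its r value should match the R^2 above. A sketch, assuming df23 from above:

# Cross-check the OLS fit with scipy.stats.linregress.
res = stats.linregress(df23['%black'], df23['%hispanic'])
print("slope:", res.slope, "intercept:", res.intercept, "R^2:", res.rvalue**2)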
10.8.2. Section 2 Exercise#
# read from csv file, dropping all entries with N/A values
violence = pd.read_csv('Violence.csv').dropna(subset = ['LATITUDE', 'LONGITUDE'])
# Streamline columns to just latitude and longitude, reduce to just first 1000 entries
violence = violence[['LATITUDE', 'LONGITUDE']].head(1000)
# Reset the index for consistent numbering
violence = violence.reset_index(drop = True)
# Get the 100 colors used to identify clusters
colorlist = list(mcolors.XKCD_COLORS.values())[:100]
# Make a map that uses k-means clustering to divide locations into up to 100 clusters
# The input variable clusters specifies the number of clusters.
# The input variable data specifies the locations.
def make_map(clusters,data):
    assert clusters >= 1, "Number of clusters must be at least 1"
    assert clusters <= len(colorlist), "Number of clusters exceeds maximum amount"
    x=data[['LATITUDE', 'LONGITUDE']].copy() # copy so the cluster labels can be added as a new column
    k_means = KMeans(n_clusters=clusters, n_init=10) # set n_init explicitly to avoid a FutureWarning
    k_means.fit(x)
    k_means_labels = k_means.labels_
    x['labels'] = k_means_labels
    k_map = folium.Map(location=[41.783, -87.621], tiles="Stamen Toner", zoom_start=10)
    for i in np.arange(0,len(x),1): # add the locations one by one to the base map
        p=[x.loc[i,"LATITUDE"],x.loc[i,"LONGITUDE"]]
        k_map.add_child(folium.CircleMarker(p, radius=1,color=colorlist[x.loc[i, 'labels']], fill = True, fill_opacity = 1))
    return k_map
Let’s take a look at the 22 clusters.
cluster22_map = make_map(22,violence)
cluster22_map
Here is an overlay of this map onto police districts.
While it is not perfect, our k-means algorithm clustered the violent occurrences in a similar manner to the police district boundaries.
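The map above uses 22 clusters so that it can be compared with the police district boundaries. A hedged way to explore other choices of k is an elbow plot of the k-means inertia (the within-cluster sum of squared distances); a sketch, assuming the violence frame defined above:

# Elbow-plot sketch: fit k-means for a range of cluster counts and record the inertia.
coords = violence[['LATITUDE', 'LONGITUDE']]
ks = range(2, 41, 2)
inertias = [KMeans(n_clusters=k, n_init=10).fit(coords).inertia_ for k in ks]
plt.plot(list(ks), inertias, 'o-')
plt.xlabel('number of clusters k')
plt.ylabel('inertia')
plt.show()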
10.8.3. Section 3 Exercise#
def imagetovector(npix,directory,nimages):
    n=npix # use nxn pixel images
    # All images should be stored in a folder (e.g. "letters") located in the same
    # directory as this notebook; its name is passed in as the directory argument.
    # Dictionaries to store the image data and the dataframes we'll make from them.
    # The dataframes are used to translate data to and from excel.
    imgs = {}
    dfs = {}
    # Each image will be resized to ensure that their proportions are consistent with each other.
    # It's best to start with images that are already similarly sized so that images don't get
    # too distorted in the resize process.
    # Adjust the size to your preference: (width, height)
    dsize = (n, n)
    # This will iterate over every image in the directory given, read it into data, and create a
    # dataframe for it. Both the image data and its corresponding dataframe are stored.
    # Note that when being read into data, we interpret the image as grayscale.
    pos = 0
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
        if os.path.isfile(f):
            imgs[pos] = cv2.imread(f, 0) # image data (grayscale)
            imgs[pos] = cv2.resize(imgs[pos], dsize)
            dfs[pos] = pd.DataFrame(imgs[pos]) # dataframe
            pos += 1
    # Exports the image dataframes to an excel file, with each excel sheet representing one image.
    # If there's already an excel file by the same name, it will overwrite it. Note that if the
    # excel file it's attempting to overwrite is already open, the write will be blocked.
    with pd.ExcelWriter('image_data.xlsx') as writer:
        for i in np.arange(0, len(dfs)):
            dfs[i].to_excel(writer, sheet_name=str(i))
    # Flatten one n x n dataframe into a single column labeled str(s).
    def matrixtovector(matrix,n,s):
        t=0
        vec=pd.DataFrame()
        for i in np.arange(0,n,1):
            for j in np.arange(0,n,1):
                vec.loc[t,str(s)]=matrix.loc[i,j]
                t=t+1
        return vec
    numimages=nimages
    data=pd.DataFrame()
    for t in np.arange(0,numimages,1):
        data.loc[:,str(t)]=matrixtovector(dfs[t],n,t)
    return data,imgs
[traindata,imgs]=imagetovector(64,"letters",8)
traindata.head(4)
|  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 |
| 1 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 |
| 2 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 |
| 3 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 | 255.0 |
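Each column of traindata is one training image flattened into a 4096-entry vector (64 x 64 grayscale pixel values; 255.0 is white, which is why the first rows are all 255.0). Any column can be reshaped back into an image to confirm the flattening; a quick sketch using the first column:

# Reshape the first flattened column back into a 64x64 grayscale image.
plt.figure(figsize=(2, 2))
plt.imshow(traindata["0"].values.reshape(64, 64), cmap="gray")
plt.show()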
from sklearn.decomposition import PCA
letter=traindata
pca = PCA(n_components=2)
pca.fit(np.transpose(letter))
letter_pca = pca.transform(np.transpose(letter))
filtered = pca.inverse_transform(letter_pca)
print("original shape: ", np.transpose(letter).shape)
print("transformed shape:", letter_pca.shape)
original shape: (8, 4096)
transformed shape: (8, 2)
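How much of the pixel-space variance the two retained components capture can be read from explained_variance_ratio_; a quick check using the pca object fitted above:

# Fraction of the variance explained by each of the two principal components.
print(pca.explained_variance_ratio_)
print("total variance explained:", pca.explained_variance_ratio_.sum())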
Answer to part a)
fig=plt.figure(figsize=(2,2))
plt.gca().imshow(filtered[4].reshape(64, 64),
cmap="gray")
<matplotlib.image.AxesImage at 0x1d880976490>
Answer to part b)
Image of the first principal component vector.
#image corresponding to the 1st basis vector.
fig=plt.figure(figsize=(2,2))
plt.gca().imshow(pca.components_[0].reshape(64, 64),
cmap="gray")
<matplotlib.image.AxesImage at 0x1d880931610>
Image of the second principal component vector.
#image corresponding to the 2nd basis vector.
fig=plt.figure(figsize=(2,2))
plt.gca().imshow(pca.components_[1].reshape(64, 64),
cmap="gray")
<matplotlib.image.AxesImage at 0x1d8809d6490>
Answer to c).
letter_pca[1]
array([1286.48056884, 3118.77874561])
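These two numbers are the coordinates of the second image with respect to the two principal component vectors shown in part b); the filtered (reconstructed) image is the mean image plus this linear combination of the components. A small check, assuming the pca, letter_pca, and filtered objects above:

# filtered[1] equals the mean image plus the two-component linear combination.
recon = pca.mean_ + letter_pca[1] @ pca.components_
print(np.allclose(recon, filtered[1]))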
Answer to d) Using only the first two pixels would model just two of the 4096 pixel values in each image, so we would have essentially no idea what the images look like.
10.8.4. Section 4 Exercise#
def imagetovector(npix,directory,nimages):
    n=npix # use nxn pixel images
    # All images should be stored in a folder (e.g. "exerciseimages") located in the same
    # directory as this notebook; its name is passed in as the directory argument.
    # Dictionaries to store the image data and the dataframes we'll make from them.
    # The dataframes are used to translate data to and from excel.
    imgs = {}
    dfs = {}
    # Each image will be resized to ensure that their proportions are consistent with each other.
    # It's best to start with images that are already similarly sized so that images don't get
    # too distorted in the resize process.
    # Adjust the size to your preference: (width, height)
    dsize = (n, n)
    # This will iterate over every image in the directory given, read it into data, and create a
    # dataframe for it. Both the image data and its corresponding dataframe are stored.
    # Note that when being read into data, we interpret the image as grayscale.
    pos = 0
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
        if os.path.isfile(f):
            imgs[pos] = cv2.imread(f, 0) # image data (grayscale)
            imgs[pos] = cv2.resize(imgs[pos], dsize)
            dfs[pos] = pd.DataFrame(imgs[pos]) # dataframe
            pos += 1
    # Exports the image dataframes to an excel file, with each excel sheet representing one image.
    # If there's already an excel file by the same name, it will overwrite it. Note that if the
    # excel file it's attempting to overwrite is already open, the write will be blocked.
    with pd.ExcelWriter('image_data.xlsx') as writer:
        for i in np.arange(0, len(dfs)):
            dfs[i].to_excel(writer, sheet_name=str(i))
    # Flatten one n x n dataframe into a single column labeled str(s).
    def matrixtovector(matrix,n,s):
        t=0
        vec=pd.DataFrame()
        for i in np.arange(0,n,1):
            for j in np.arange(0,n,1):
                vec.loc[t,str(s)]=matrix.loc[i,j]
                t=t+1
        return vec
    numimages=nimages
    data=pd.DataFrame()
    for t in np.arange(0,numimages,1):
        data.loc[:,str(t)]=matrixtovector(dfs[t],n,t)
    return data,imgs
[traindata,imgs]=imagetovector(32,"exerciseimages",8)
traindata.head(2)
|  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 54.0 | 29.0 | 0.0 | 17.0 | 215.0 | 245.0 | 88.0 | 197.0 |
| 1 | 41.0 | 65.0 | 0.0 | 16.0 | 182.0 | 242.0 | 93.0 | 186.0 |
model = SVC(kernel='linear', C=1)
X=[traindata.loc[:,"0"],traindata.loc[:,"1"],traindata.loc[:,"2"],traindata.loc[:,"3"],traindata.loc[:,"4"],traindata.loc[:,"5"],traindata.loc[:,"6"],traindata.loc[:,"7"]]
Y=[0,0,0,0,1,1,1,1] #Labels the images 0=Hillary Clinton 1=Michelle Obama
model.fit(X,Y)
ypred=model.predict(X)
ypred
array([0, 0, 0, 0, 1, 1, 1, 1])
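The StandardScaler and Pipeline imports at the top of the notebook are not needed for this solution, but a standardized variant of the same linear SVM could be built with them. A hedged sketch, using the X and Y defined above:

# Hypothetical variant: scale the pixel features before fitting the linear SVM.
svm_pipe = Pipeline([("scale", StandardScaler()),
                     ("svc", SVC(kernel='linear', C=1))])
svm_pipe.fit(X, Y)
print(svm_pipe.predict(X))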
[testdata,testimgs]=imagetovector(32,"exercisetestimages",4)
testdata.tail(2)
|  | 0 | 1 | 2 | 3 |
| --- | --- | --- | --- | --- |
| 1022 | 80.0 | 69.0 | 36.0 | 42.0 |
| 1023 | 80.0 | 44.0 | 37.0 | 47.0 |
#display the first test image
plt.imshow(testimgs[0], cmap="gray")
<matplotlib.image.AxesImage at 0x1d8806ee490>
#display the second test image
plt.imshow(testimgs[1], cmap="gray")
<matplotlib.image.AxesImage at 0x1d8806a9610>
#display the third test image
plt.imshow(testimgs[2], cmap="gray")
<matplotlib.image.AxesImage at 0x1d8fd121610>
#display the fourth test image
plt.imshow(testimgs[3], cmap="gray")
<matplotlib.image.AxesImage at 0x1d8fd179610>
Xtest=[testdata.loc[:,"0"],testdata.loc[:,"1"],testdata.loc[:,"2"],testdata.loc[:,"3"]]
model.predict(Xtest)
array([1, 0, 0, 1])
Note that in the output prediction, 0=Hillary Clinton and 1=Michelle Obama.
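To make the 0/1 output easier to read, the predicted labels can be mapped back to names; a small sketch using the model and Xtest above:

# Map the numeric predictions back to the names used for the training labels.
names = {0: "Hillary Clinton", 1: "Michelle Obama"}
print([names[p] for p in model.predict(Xtest)])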