"
],
"text/plain": [
" address total black hispanic white zip %black %hispanic \\\n",
"0 2819 W 21ST PL 342 33 304 2 60623 9.6 88.9 \n",
"\n",
" %white \n",
"0 0.0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Express each group's enrollment as a percentage of the school's total, rounded to one\n",
"# decimal place. All three shares are scaled by 100 so they sit on the same 0-100 scale,\n",
"# and whole columns are computed at once instead of one row at a time.\n",
"for group in ('black', 'hispanic', 'white'):\n",
"    df23['%' + group] = (100 * df23[group] / df23['total']).round(1)\n",
"df23.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "34f7bee4",
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Intercept is [98.71284906]\n",
"Slope is [[-0.99515441]]\n",
"R^2 for OLS is 0.9996943205528053\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.linear_model import LinearRegression  # scikit-learn's linear regression estimator\n",
"\n",
"# Fit %hispanic as a linear function of %black. Both sides are selected with double\n",
"# brackets so they stay two-dimensional, which is why the slope prints as a nested array.\n",
"X = df23[[\"%black\"]]\n",
"Y = df23[[\"%hispanic\"]]\n",
"reg = LinearRegression().fit(X, Y)\n",
"print(\"Intercept is \", reg.intercept_)\n",
"print(\"Slope is \", reg.coef_)\n",
"print(\"R^2 for OLS is \", reg.score(X, Y))\n",
"\n",
"# Scalar slope and intercept of the fitted line y = m*x + b.\n",
"m = reg.coef_[0, 0]\n",
"b = reg.intercept_[0]\n",
"# x values on the regression line run from 0 up to (not including) 100 with a spacing of .01\n",
"x = np.arange(0, 100, .01)\n",
"y = m*x + b\n",
"\n",
"# Scatter the schools, then overlay the fitted line in red.\n",
"# Note that `fig` holds the Axes object returned by DataFrame.plot.\n",
"fig = df23.plot(x='%black', y='%hispanic', style='o')\n",
"plt.plot(x, y, 'r')\n",
"plt.title('% Black vs % Hispanic in 60623 pre-K - 8 Schools')\n",
"plt.xlabel('% Black')\n",
"plt.ylabel('% Hispanic')\n",
"plt.legend([], [], frameon=True)  # empty handles and labels: no legend entries are listed\n",
"plt.grid()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "001650de",
"metadata": {},
"source": [
"## Section 2 Exercise"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a5df4121",
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [],
"source": [
"# Load the violence records from the CSV file, drop rows lacking a latitude or longitude,\n",
"# keep only those two columns, take the first 1000 remaining rows, and renumber the\n",
"# index from 0 so the rows are labelled consecutively.\n",
"violence = (\n",
"    pd.read_csv('Violence.csv')\n",
"    .dropna(subset=['LATITUDE', 'LONGITUDE'])\n",
"    .loc[:, ['LATITUDE', 'LONGITUDE']]\n",
"    .head(1000)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# The first 100 XKCD colors; each cluster is drawn in its own color.\n",
"colorlist = [*mcolors.XKCD_COLORS.values()][:100]\n",
"\n",
"# Make a map that uses k-means clustering to divide locations into up to 100 clusters.\n",
"# The input variable `clusters` specifies the number of clusters.\n",
"# The input variable `data` specifies the locations.\n",
"def make_map(clusters, data, random_state=None):\n",
"    \"\"\"Cluster the points in `data` with k-means and draw them on a folium map.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    clusters : int\n",
"        Number of clusters; must be between 1 and len(colorlist).\n",
"    data : DataFrame\n",
"        Must contain 'LATITUDE' and 'LONGITUDE' columns. It is not modified.\n",
"    random_state : int or None, optional\n",
"        Seed forwarded to KMeans. Pass an int to get the same clusters on every run;\n",
"        the default (None) leaves the clustering unseeded.\n",
"\n",
"    Returns\n",
"    -------\n",
"    folium.Map\n",
"        Map with one CircleMarker per row of `data`, colored by cluster label.\n",
"\n",
"    Raises\n",
"    ------\n",
"    AssertionError\n",
"        If `clusters` is below 1 or above len(colorlist).\n",
"    \"\"\"\n",
"    assert clusters >= 1, \"Number of clusters must be at least 1\"\n",
"    assert clusters <= len(colorlist), \"Number of clusters exceeds maximum amount\"\n",
"    points = data[['LATITUDE', 'LONGITUDE']]\n",
"    # n_init is set explicitly to 10 so scikit-learn does not emit its FutureWarning\n",
"    # about the default changing from 10 to 'auto'.\n",
"    k_means = KMeans(n_clusters=clusters, n_init=10, random_state=random_state)\n",
"    k_means.fit(points)\n",
"\n",
"    # NOTE(review): the \"Stamen Toner\" tile set may be unavailable in newer folium\n",
"    # releases -- confirm against the installed version.\n",
"    k_map = folium.Map(location=[41.783, -87.621], tiles=\"Stamen Toner\", zoom_start=10)\n",
"\n",
"    # Pair each point with its cluster label by position, so the frame's index does not\n",
"    # have to be 0..n-1, and add the points to the base map one by one.\n",
"    for lat, lon, label in zip(points['LATITUDE'], points['LONGITUDE'], k_means.labels_):\n",
"        k_map.add_child(folium.CircleMarker([lat, lon], radius=1, color=colorlist[label],\n",
"                                            fill=True, fill_opacity=1))\n",
"    return k_map"
]
},
{
"cell_type": "markdown",
"id": "3b8fdf49",
"metadata": {},
"source": [
"Let's take a look at the 22 clusters."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e59e306b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\pisihara\\AppData\\Local\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
" warnings.warn(\n",
"C:\\Users\\pisihara\\AppData\\Local\\anaconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=4.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"
Make this Notebook Trusted to load map: File -> Trust Notebook
"
],
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster22_map = make_map(22,violence)\n",
"cluster22_map"
]
},
{
"cell_type": "markdown",
"id": "5eae1866",
"metadata": {},
"source": [
"Here is an overlay of this map onto police districts."
]
},
{
"cell_type": "markdown",
"id": "d329d718",
"metadata": {},
"source": [
"\n",
" "
]
},
{
"cell_type": "markdown",
"id": "e61a3d44",
"metadata": {},
"source": [
"While it is not perfect, our k-means algorithm clustered the violent occurrences in a similar manner to the police district boundaries."
]
},
{
"cell_type": "markdown",
"id": "233e4255",
"metadata": {},
"source": [
"## Section 3 Exercise"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "20a06013",
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [],
"source": [
"def imagetovector(npix, directory, nimages):\n",
"    \"\"\"Read the images in a folder as grayscale and flatten each one into a column.\n",
"\n",
"    Every regular file in `directory` that OpenCV can decode is read in grayscale mode\n",
"    and resized to npix x npix. Files are visited in sorted name order and numbered\n",
"    0, 1, 2, ... As a side effect the resized images are written to 'image_data.xlsx'\n",
"    in the current working directory, one sheet per image (sheet names '0', '1', ...).\n",
"    An existing file of that name is overwritten; the write is blocked if the file is\n",
"    open in another program.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    npix : int\n",
"        Side length, in pixels, of the square every image is resized to.\n",
"    directory : str\n",
"        Folder holding the images, e.g. \"images\". Similarly sized source images are\n",
"        distorted least by the resize.\n",
"    nimages : int\n",
"        Number of images (taken from position 0 upward) to include in `data`.\n",
"\n",
"    Returns\n",
"    -------\n",
"    data : DataFrame\n",
"        npix*npix rows and one float column per image, named '0' ... str(nimages-1).\n",
"        Each column holds that image's pixels flattened row by row.\n",
"    imgs : dict\n",
"        Maps image position to its resized grayscale pixel array.\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError\n",
"        If `nimages` is larger than the number of images that were loaded.\n",
"    \"\"\"\n",
"    import warnings  # local import: only needed to report unreadable files\n",
"\n",
"    dsize = (npix, npix)  # (width, height) handed to cv2.resize\n",
"\n",
"    # imgs holds the pixel arrays; dfs holds the matching dataframes used for the Excel export.\n",
"    imgs = {}\n",
"    dfs = {}\n",
"    pos = 0\n",
"    # os.listdir returns names in arbitrary order, so sort them to keep the image\n",
"    # numbering the same from run to run.\n",
"    for filename in sorted(os.listdir(directory)):\n",
"        f = os.path.join(directory, filename)\n",
"        if not os.path.isfile(f):\n",
"            continue\n",
"        img = cv2.imread(f, 0)  # flag 0: interpret the image as grayscale\n",
"        if img is None:\n",
"            # cv2.imread returns None for a file it cannot decode; skip it rather than\n",
"            # failing inside cv2.resize.\n",
"            warnings.warn(f\"Skipping unreadable image file: {f}\")\n",
"            continue\n",
"        imgs[pos] = cv2.resize(img, dsize)\n",
"        dfs[pos] = pd.DataFrame(imgs[pos])\n",
"        pos += 1\n",
"\n",
"    # Export the image dataframes to an Excel file, one sheet per image.\n",
"    with pd.ExcelWriter('image_data.xlsx') as writer:\n",
"        for i in range(len(dfs)):\n",
"            dfs[i].to_excel(writer, sheet_name=str(i))\n",
"\n",
"    # reshape(-1) walks the array row by row, i.e. pixel (i, j) lands at position\n",
"    # i*npix + j; this replaces growing a dataframe one cell at a time.\n",
"    data = pd.DataFrame(\n",
"        {str(t): imgs[t].reshape(-1).astype(float) for t in range(nimages)}\n",
"    )\n",
"    return data, imgs"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "5e6e8bad",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Render the second basis vector (row 1 of pca.components_) as a 64x64 grayscale image.\n",
"fig = plt.figure(figsize=(2, 2))\n",
"second_component = pca.components_[1].reshape(64, 64)\n",
"fig.gca().imshow(second_component, cmap=\"gray\")"
]
},
{
"cell_type": "markdown",
"id": "1ab50b70",
"metadata": {},
"source": [
"Answer to c)."
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4371f192",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1286.48056884, 3118.77874561])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"letter_pca[1]"
]
},
{
"cell_type": "markdown",
"id": "27863c13",
"metadata": {},
"source": [
"Answer to d): We could only model the first two pixels, so we would not have any idea of what the images look like."
]
},
{
"cell_type": "markdown",
"id": "205ed0cf",
"metadata": {},
"source": [
"## Section 4 Exercise"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2c093d9e",
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [],
"source": [
"# NOTE: this cell re-defines the imagetovector function from the Section 3 exercise;\n",
"# the definition below replaces the earlier one from here on.\n",
"def imagetovector(npix, directory, nimages):\n",
"    \"\"\"Read the images in a folder as grayscale and flatten each one into a column.\n",
"\n",
"    Every regular file in `directory` that OpenCV can decode is read in grayscale mode\n",
"    and resized to npix x npix. Files are visited in sorted name order and numbered\n",
"    0, 1, 2, ... As a side effect the resized images are written to 'image_data.xlsx'\n",
"    in the current working directory, one sheet per image (sheet names '0', '1', ...).\n",
"    An existing file of that name is overwritten; the write is blocked if the file is\n",
"    open in another program.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    npix : int\n",
"        Side length, in pixels, of the square every image is resized to.\n",
"    directory : str\n",
"        Folder holding the images, e.g. \"images\". Similarly sized source images are\n",
"        distorted least by the resize.\n",
"    nimages : int\n",
"        Number of images (taken from position 0 upward) to include in `data`.\n",
"\n",
"    Returns\n",
"    -------\n",
"    data : DataFrame\n",
"        npix*npix rows and one float column per image, named '0' ... str(nimages-1).\n",
"        Each column holds that image's pixels flattened row by row.\n",
"    imgs : dict\n",
"        Maps image position to its resized grayscale pixel array.\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError\n",
"        If `nimages` is larger than the number of images that were loaded.\n",
"    \"\"\"\n",
"    import warnings  # local import: only needed to report unreadable files\n",
"\n",
"    dsize = (npix, npix)  # (width, height) handed to cv2.resize\n",
"\n",
"    # imgs holds the pixel arrays; dfs holds the matching dataframes used for the Excel export.\n",
"    imgs = {}\n",
"    dfs = {}\n",
"    pos = 0\n",
"    # os.listdir returns names in arbitrary order, so sort them to keep the image\n",
"    # numbering the same from run to run.\n",
"    for filename in sorted(os.listdir(directory)):\n",
"        f = os.path.join(directory, filename)\n",
"        if not os.path.isfile(f):\n",
"            continue\n",
"        img = cv2.imread(f, 0)  # flag 0: interpret the image as grayscale\n",
"        if img is None:\n",
"            # cv2.imread returns None for a file it cannot decode; skip it rather than\n",
"            # failing inside cv2.resize.\n",
"            warnings.warn(f\"Skipping unreadable image file: {f}\")\n",
"            continue\n",
"        imgs[pos] = cv2.resize(img, dsize)\n",
"        dfs[pos] = pd.DataFrame(imgs[pos])\n",
"        pos += 1\n",
"\n",
"    # Export the image dataframes to an Excel file, one sheet per image.\n",
"    with pd.ExcelWriter('image_data.xlsx') as writer:\n",
"        for i in range(len(dfs)):\n",
"            dfs[i].to_excel(writer, sheet_name=str(i))\n",
"\n",
"    # reshape(-1) walks the array row by row, i.e. pixel (i, j) lands at position\n",
"    # i*npix + j; this replaces growing a dataframe one cell at a time.\n",
"    data = pd.DataFrame(\n",
"        {str(t): imgs[t].reshape(-1).astype(float) for t in range(nimages)}\n",
"    )\n",
"    return data, imgs"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "f991970b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"