Ressources Utiles pour le projet:
https://pandas.pydata.org/docs/getting_started/index.html
https://pandas.pydata.org/docs/user_guide/10min.html#min
https://pandas.pydata.org/docs/user_guide/cookbook.html#cookbook
https://towardsdatascience.com/a-checklist-for-data-wrangling-8f106c093fef
# import libraries
import pandas as pd
# charger le fichier locations.csv dans un pandas dataframe locations et afficher les 5 premières lignes de votre df
#TBD
neighborhood | title | price | bedrooms | pid | longitude | date | subregion | link | latitude | sqft | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | (bayview) | Take A TOUR ON OUR ONE FURNISHED BEDROOM TODAY | $950 | / 1br - | 4076905111 | -122.396965 | Sep 18 2013 | SF | /sfc/apa/4076905111.html | 37.761216 | / 1br - |
1 | (bayview) | Only walking distance to major shopping centers. | $950 | / 1br - | 4076901755 | -122.396793 | Sep 18 2013 | SF | /sfc/apa/4076901755.html | 37.761080 | / 1br - |
2 | (bayview) | furnished - 1 Bedroom(s), 1 Bath(s), Air Condi... | $950 | / 1br - | 4076899340 | -122.397100 | Sep 18 2013 | SF | /sfc/apa/4076899340.html | 37.762100 | / 1br - |
3 | (financial district) | *NEW* Beautiful, Upscale Condo in Historic Jac... | $3300 | / 1br - 830ft² - | 4067393707 | -122.399747 | Sep 18 2013 | SF | /sfc/apa/4067393707.html | 37.798108 | / 1br - 830ft² - |
4 | (visitacion valley) | 楼上全层3房 | $2000 | / 3br - 1280ft² - | 4076901071 | NaN | Sep 18 2013 | SF | /sfc/apa/4076901071.html | NaN | / 3br - 1280ft² - |
# supprimer les parenthèses autour des valeurs de la colonne neighborhood
# supprimer le symbole $ de la colonne price et afficher les valeurs en tant que float
#TBD
neighborhood | title | price | bedrooms | pid | longitude | date | subregion | link | latitude | sqft | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | bayview | Take A TOUR ON OUR ONE FURNISHED BEDROOM TODAY | 950.0 | / 1br - | 4076905111 | -122.396965 | Sep 18 2013 | SF | /sfc/apa/4076905111.html | 37.761216 | / 1br - |
1 | bayview | Only walking distance to major shopping centers. | 950.0 | / 1br - | 4076901755 | -122.396793 | Sep 18 2013 | SF | /sfc/apa/4076901755.html | 37.761080 | / 1br - |
2 | bayview | furnished - 1 Bedroom(s), 1 Bath(s), Air Condi... | 950.0 | / 1br - | 4076899340 | -122.397100 | Sep 18 2013 | SF | /sfc/apa/4076899340.html | 37.762100 | / 1br - |
3 | financial district | *NEW* Beautiful, Upscale Condo in Historic Jac... | 3300.0 | / 1br - 830ft² - | 4067393707 | -122.399747 | Sep 18 2013 | SF | /sfc/apa/4067393707.html | 37.798108 | / 1br - 830ft² - |
4 | visitacion valley | 楼上全层3房 | 2000.0 | / 3br - 1280ft² - | 4076901071 | NaN | Sep 18 2013 | SF | /sfc/apa/4076901071.html | NaN | / 3br - 1280ft² - |
# Éclater le contenu de la colonne date en trois colonnes (month, day, year) et supprimer la colonne d'origine (date)
#TBD
neighborhood | title | price | bedrooms | pid | longitude | subregion | link | latitude | sqft | month | day | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | bayview | Take A TOUR ON OUR ONE FURNISHED BEDROOM TODAY | 950.0 | / 1br - | 4076905111 | -122.396965 | SF | /sfc/apa/4076905111.html | 37.761216 | / 1br - | Sep | 18 | 2013 |
1 | bayview | Only walking distance to major shopping centers. | 950.0 | / 1br - | 4076901755 | -122.396793 | SF | /sfc/apa/4076901755.html | 37.761080 | / 1br - | Sep | 18 | 2013 |
2 | bayview | furnished - 1 Bedroom(s), 1 Bath(s), Air Condi... | 950.0 | / 1br - | 4076899340 | -122.397100 | SF | /sfc/apa/4076899340.html | 37.762100 | / 1br - | Sep | 18 | 2013 |
3 | financial district | *NEW* Beautiful, Upscale Condo in Historic Jac... | 3300.0 | / 1br - 830ft² - | 4067393707 | -122.399747 | SF | /sfc/apa/4067393707.html | 37.798108 | / 1br - 830ft² - | Sep | 18 | 2013 |
4 | visitacion valley | 楼上全层3房 | 2000.0 | / 3br - 1280ft² - | 4076901071 | NaN | SF | /sfc/apa/4076901071.html | NaN | / 3br - 1280ft² - | Sep | 18 | 2013 |
# définir une fonction clean_bedrooms qui permet de changer la colonne bedrooms pour garder uniquement le premier entier
# exemples: /1br - => 1
# /3br - 1280ft² - => 3
#TBD
neighborhood | title | price | bedrooms | pid | longitude | subregion | link | latitude | sqft | month | day | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | bayview | Take A TOUR ON OUR ONE FURNISHED BEDROOM TODAY | 950.0 | 1.0 | 4076905111 | -122.396965 | SF | /sfc/apa/4076905111.html | 37.761216 | / 1br - | Sep | 18 | 2013 |
1 | bayview | Only walking distance to major shopping centers. | 950.0 | 1.0 | 4076901755 | -122.396793 | SF | /sfc/apa/4076901755.html | 37.761080 | / 1br - | Sep | 18 | 2013 |
2 | bayview | furnished - 1 Bedroom(s), 1 Bath(s), Air Condi... | 950.0 | 1.0 | 4076899340 | -122.397100 | SF | /sfc/apa/4076899340.html | 37.762100 | / 1br - | Sep | 18 | 2013 |
3 | financial district | *NEW* Beautiful, Upscale Condo in Historic Jac... | 3300.0 | 1.0 | 4067393707 | -122.399747 | SF | /sfc/apa/4067393707.html | 37.798108 | / 1br - 830ft² - | Sep | 18 | 2013 |
4 | visitacion valley | 楼上全层3房 | 2000.0 | 3.0 | 4076901071 | NaN | SF | /sfc/apa/4076901071.html | NaN | / 3br - 1280ft² - | Sep | 18 | 2013 |
# définir une fonction clean_surface qui permet de changer la colonne sqft et garder uniquement le dernier entier
# exemple: /3br - 1280ft² - => 1280
#TBD
neighborhood | title | price | bedrooms | pid | longitude | subregion | link | latitude | sqft | month | day | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | bayview | Take A TOUR ON OUR ONE FURNISHED BEDROOM TODAY | 950.0 | 1.0 | 4076905111 | -122.396965 | SF | /sfc/apa/4076905111.html | 37.761216 | NaN | Sep | 18 | 2013 |
1 | bayview | Only walking distance to major shopping centers. | 950.0 | 1.0 | 4076901755 | -122.396793 | SF | /sfc/apa/4076901755.html | 37.761080 | NaN | Sep | 18 | 2013 |
2 | bayview | furnished - 1 Bedroom(s), 1 Bath(s), Air Condi... | 950.0 | 1.0 | 4076899340 | -122.397100 | SF | /sfc/apa/4076899340.html | 37.762100 | NaN | Sep | 18 | 2013 |
3 | financial district | *NEW* Beautiful, Upscale Condo in Historic Jac... | 3300.0 | 1.0 | 4067393707 | -122.399747 | SF | /sfc/apa/4067393707.html | 37.798108 | 830.0 | Sep | 18 | 2013 |
4 | visitacion valley | 楼上全层3房 | 2000.0 | 3.0 | 4076901071 | NaN | SF | /sfc/apa/4076901071.html | NaN | 1280.0 | Sep | 18 | 2013 |
df.describe()
price | bedrooms | pid | longitude | latitude | sqft | day | year | |
---|---|---|---|---|---|---|---|---|
count | 4908.000000 | 4544.000000 | 5.000000e+03 | 3143.000000 | 3143.000000 | 3178.000000 | 5000.000000 | 5000.0 |
mean | 2656.999389 | 2.066241 | 4.068059e+09 | -122.264948 | 37.757411 | 1173.613593 | 17.523800 | 2013.0 |
std | 1915.147477 | 1.011606 | 1.344453e+07 | 0.278825 | 0.364646 | 751.552623 | 0.766258 | 0.0 |
min | 1.000000 | 1.000000 | 4.008227e+09 | -123.799100 | 36.813820 | 1.000000 | 14.000000 | 2013.0 |
25% | 1695.000000 | 1.000000 | 4.065685e+09 | -122.442365 | 37.469365 | 747.250000 | 17.000000 | 2013.0 |
50% | 2208.500000 | 2.000000 | 4.074290e+09 | -122.283714 | 37.760858 | 1000.000000 | 18.000000 | 2013.0 |
75% | 2995.000000 | 3.000000 | 4.075949e+09 | -122.045047 | 37.900832 | 1350.000000 | 18.000000 | 2013.0 |
max | 35000.000000 | 8.000000 | 4.076908e+09 | -120.034132 | 41.456848 | 12700.000000 | 18.000000 | 2013.0 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 neighborhood 4986 non-null object
1 title 5000 non-null object
2 price 4908 non-null float64
3 bedrooms 4544 non-null float64
4 pid 5000 non-null int64
5 longitude 3143 non-null float64
6 subregion 5000 non-null object
7 link 5000 non-null object
8 latitude 3143 non-null float64
9 sqft 3178 non-null float64
10 month 5000 non-null object
11 day 5000 non-null int32
12 year 5000 non-null int32
dtypes: float64(5), int32(2), int64(1), object(5)
memory usage: 371.2+ KB
df['price'].isna().sum()
92
len(df['price'])
5000
#TBC
en utilisant le dernier Dataframe (avec les données propres) :
filtered.head()
neighborhood | title | price | bedrooms | pid | longitude | subregion | link | latitude | sqft | month | day | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | bayview | Take A TOUR ON OUR ONE FURNISHED BEDROOM TODAY | 950.0 | 1.0 | 4076905111 | -122.396965 | SF | /sfc/apa/4076905111.html | 37.761216 | NaN | Sep | 18 | 2013 |
1 | bayview | Only walking distance to major shopping centers. | 950.0 | 1.0 | 4076901755 | -122.396793 | SF | /sfc/apa/4076901755.html | 37.761080 | NaN | Sep | 18 | 2013 |
2 | bayview | furnished - 1 Bedroom(s), 1 Bath(s), Air Condi... | 950.0 | 1.0 | 4076899340 | -122.397100 | SF | /sfc/apa/4076899340.html | 37.762100 | NaN | Sep | 18 | 2013 |
3 | financial district | *NEW* Beautiful, Upscale Condo in Historic Jac... | 3300.0 | 1.0 | 4067393707 | -122.399747 | SF | /sfc/apa/4067393707.html | 37.798108 | 830.0 | Sep | 18 | 2013 |
4 | visitacion valley | 楼上全层3房 | 2000.0 | 3.0 | 4076901071 | NaN | SF | /sfc/apa/4076901071.html | NaN | 1280.0 | Sep | 18 | 2013 |
#TBC
price | bedrooms | pid | longitude | latitude | sqft | day | year | |
---|---|---|---|---|---|---|---|---|
count | 1936.000000 | 1936.000000 | 1.936000e+03 | 1936.000000 | 1936.000000 | 1936.000000 | 1936.000000 | 1936.0 |
mean | 2598.870351 | 2.107955 | 4.068081e+09 | -122.222475 | 37.734211 | 1181.790806 | 17.541322 | 2013.0 |
std | 1284.523424 | 1.002436 | 1.353531e+07 | 0.284795 | 0.386713 | 608.700661 | 0.746636 | 0.0 |
min | 575.000000 | 1.000000 | 4.012055e+09 | -123.799100 | 36.813820 | 200.000000 | 14.000000 | 2013.0 |
25% | 1800.000000 | 1.000000 | 4.065635e+09 | -122.432609 | 37.423714 | 785.000000 | 17.000000 | 2013.0 |
50% | 2295.000000 | 2.000000 | 4.074286e+09 | -122.243881 | 37.680504 | 1011.000000 | 18.000000 | 2013.0 |
75% | 3000.000000 | 3.000000 | 4.076001e+09 | -122.000592 | 37.947712 | 1400.000000 | 18.000000 | 2013.0 |
max | 9999.000000 | 5.000000 | 4.076903e+09 | -120.063949 | 41.456848 | 6500.000000 | 18.000000 | 2013.0 |
# TBC
stocker votre jeu de données propre dans une base de données que vous créerez
#TBD