| Line 1: |
Line 1: |
| − | == Install == | + | ==Install== |
| | <syntaxhighlight lang="bash"> | | <syntaxhighlight lang="bash"> |
| | pip install pandas | | pip install pandas |
| | </syntaxhighlight> | | </syntaxhighlight> |
| | | | |
| − | == Read CSV == | + | ==Read CSV== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df = pd.read_csv('news_2019.05.10.csv') |
| | + | </syntaxhighlight> |
| | + | |
| | + | ==Series== |
| | + | <syntaxhighlight lang="python3"> |
| | + | s = pd.Series(['banana', 42]) |
| | + | s = pd.Series(['banana', 42], index=['Fruit', 'Calories']) |
| | + | s.values |
| | + | s.keys() |
| | + | s.values[0] |
| | + | s.keys()[0] |
| | + | s.min() |
| | + | s.max() |
| | + | s.std() |
| | + | </syntaxhighlight> |
| | + | |
| | + | ==Dataframe== |
| | + | |
| | + | ===Create=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | scientists = pd.DataFrame({ |
| | + | 'Name': ['Rosaline Franklin', 'William Gosset'], |
| | + | 'Occupation': ['Chemist', 'Statistician'], |
| | + | 'Born': ['1920-07-25', '1876-06-13'], |
| | + | 'Died': ['1958-04-16', '1937-10-16'], |
| | + | }) |
| | + | </syntaxhighlight>To set the column order explicitly (and use the names as the row index), pass <code>columns</code> and <code>index</code>:<syntaxhighlight lang="python3"> |
| | + | scientists = pd.DataFrame({ |
| | + | 'Occupation': ['Chemist', 'Statistician'], |
| | + | 'Born': ['1920-07-25', '1876-06-13'], |
| | + | 'Died': ['1958-04-16', '1937-10-16'], |
| | + | }, index=['Rosaline Franklin', 'William Gosset'], columns=['Occupation', 'Born', 'Died']) |
| | + | </syntaxhighlight> |
| | + | |
| | + | ===From CSV=== |
| | <syntaxhighlight lang="python3"> | | <syntaxhighlight lang="python3"> |
| | news = pd.read_csv('news_2019.05.10.csv') | | news = pd.read_csv('news_2019.05.10.csv') |
| | </syntaxhighlight> | | </syntaxhighlight> |
| | + | |
| | + | ===Info=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df.index |
| | + | df.columns |
| | + | df.values |
| | + | df.shape |
| | + | df.dtypes |
| | + | df.head() |
| | + | df.tail() |
| | + | df.info() |
| | + | df.describe() |
| | + | </syntaxhighlight> |
| | + | |
| | + | ===Select 1 row=== |
| | + | <syntaxhighlight lang="python3"> |
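| | + | df.iloc[[1]] # position-based: the row at position 1 (list selector returns a DataFrame) |
| | + | df.loc[0] # label-based: the row whose index label is 0 |
| | + | df.iloc[-1] == df.loc[df.shape[0]-1] # same row when the index is the default 0..n-1 |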
| | + | </syntaxhighlight> |
| | + | |
| | + | ===Select specific rows=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df.loc[[9, 99, 999]] |
| | + | </syntaxhighlight> |
| | + | |
| | + | ===Select 1 column=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | sumarys = df[['summary']] |
| | + | # Or, to get a column's values as a plain Python list: |
| | + | list(df['one']) |
| | + | dfToList = df['one'].tolist() |
| | + | </syntaxhighlight> |
| | + | |
| | + | ===Select multiple columns=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df[['column1', 'column2', 'column3']] |
| | + | </syntaxhighlight> |
| | + | |
| | + | ===Select 1 cell=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df.iloc[1][1] |
| | + | df.iloc[1]['summary'] |
| | + | df.iloc[1, 3] |
| | + | |
| | + | df.loc[1, 'summary'] |
| | + | </syntaxhighlight> |
| | + | |
| | + | ===Subset multiple rows and multiple columns=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df.iloc[[1,34,56],[2,4,5]] |
| | + | df.loc[[1,34,56],['modification_date', 'content']] |
| | + | </syntaxhighlight> |
| | + | [[Category:Python]] |
| | + | [[Category:DataScience]] |