| Line 5: |
Line 5: |
| | | | |
| | ==Read CSV== | | ==Read CSV== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df = pd.read_csv('news_2019.05.10.csv') |
| | + | </syntaxhighlight> |
| | + | |
| | + | ==Series== |
| | + | <syntaxhighlight lang="python3"> |
| | + | s = pd.Series(['banana', 42]) |
| | + | s = pd.Series(['banana', 42], index=['Fruit', 'Calories']) |
| | + | s.values |
| | + | s.keys() |
| | + | s.values[0] |
| | + | s.keys()[0] |
| | + | s.min() |
| | + | s.max() |
| | + | s.std() |
| | + | </syntaxhighlight> |
| | + | |
| | + | ==Dataframe== |
| | + | |
| | + | ===Create=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | scientists = pd.DataFrame({ |
| | + | 'Name': ['Rosaline Franklin', 'William Gosset'], |
| | + | 'Occupation': ['Chemist', 'Statistician'], |
| | + | 'Born': ['1920-07-25', '1876-06-13'], |
| | + | 'Died': ['1958-04-16', '1937-10-16'], |
| | + | }) |
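| | + | </syntaxhighlight>If you want to use the names as row labels and keep a specific order of columns, pass <code>index</code> and <code>columns</code> explicitly:<syntaxhighlight lang="python3"> |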
| | + | scientists = pd.DataFrame({ |
| | + | 'Occupation': ['Chemist', 'Statistician'], |
| | + | 'Born': ['1920-07-25', '1876-06-13'], |
| | + | 'Died': ['1958-04-16', '1937-10-16'], |
| | + | }, index=['Rosaline Franklin', 'William Gosset'], columns=['Occupation', 'Born', 'Died']) |
| | + | </syntaxhighlight> |
| | + | |
| | + | ===From CSV=== |
| | <syntaxhighlight lang="python3"> | | <syntaxhighlight lang="python3"> |
| | news = pd.read_csv('news_2019.05.10.csv') | | news = pd.read_csv('news_2019.05.10.csv') |
| | </syntaxhighlight> | | </syntaxhighlight> |
| | | | |
| − | ==Dataframe== | + | ===Info=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df.index |
| | + | df.columns |
| | + | df.values |
| | + | df.shape |
| | + | df.dtypes |
| | + | df.head() |
| | + | df.tail() |
| | + | df.info() |
| | + | df.describe() |
| | + | </syntaxhighlight> |
| | | | |
| | ===Select 1 row=== | | ===Select 1 row=== |
| | <syntaxhighlight lang="python3"> | | <syntaxhighlight lang="python3"> |
| − | texts.iloc[[1]]
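| + | df.iloc[[1]] # positional indexing; the double brackets return a one-row DataFrame |
| | + | df.loc[0] # label-based indexing; returns the row labelled 0 as a Series |
| | + | df.iloc[-1] == df.loc[df.shape[0]-1] # both select the last row only when the index is the default 0..n-1 |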
| | + | </syntaxhighlight> |
| | + | |
| | + | ===Select specific rows=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df.loc[[9, 99, 999]] |
| | </syntaxhighlight> | | </syntaxhighlight> |
| | | | |
| | ===Select 1 column=== | | ===Select 1 column=== |
| | <syntaxhighlight lang="python3"> | | <syntaxhighlight lang="python3"> |
| − | sumarys = news[['summary']] | + | summaries = df[['summary']] |
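| | # Or, to get the column's values as a plain Python list: | | # Or, to get the column's values as a plain Python list: |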
| | + | list(df['one']) |
| | dfToList = df['one'].tolist() | | dfToList = df['one'].tolist() |
| | + | </syntaxhighlight> |
| | + | |
| | + | ===Select multiple columns=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df[['column1', 'column2', 'column3']] |
| | </syntaxhighlight> | | </syntaxhighlight> |
| | | | |
| | ===Select 1 cell=== | | ===Select 1 cell=== |
| | <syntaxhighlight lang="python3"> | | <syntaxhighlight lang="python3"> |
| − | texts.iloc[1][1]
| + | df.iloc[1][1] |
| − | # Or
| + | df.iloc[1]['summary'] |
| − | texts.iloc[1]['summary']
| + | df.iloc[1, 3] |
| | + | |
| | + | df.loc[1, 'summary'] |
| | + | </syntaxhighlight> |
| | + | |
| | + | ===Subset multiple rows and multiple columns=== |
| | + | <syntaxhighlight lang="python3"> |
| | + | df.iloc[[1,34,56],[2,4,5]] |
| | + | df.loc[[1,34,56],['modification_date', 'content']] |
| | </syntaxhighlight> | | </syntaxhighlight> |
| | [[Category:Python]] | | [[Category:Python]] |
| | + | [[Category:DataScience]] |