Difference between revisions of "Pandas"

From RHS Wiki
Jump to navigation Jump to search
Tag: visualeditor
m
Tag: visualeditor
 
(9 intermediate revisions by the same user not shown)
Line 7: Line 7:
 
<syntaxhighlight lang="python3">
 
<syntaxhighlight lang="python3">
 
df = pd.read_csv('news_2019.05.10.csv')
 
df = pd.read_csv('news_2019.05.10.csv')
 +
</syntaxhighlight>
 +
 +
==DataSeries==
 +
<syntaxhighlight lang="python3">
 +
s = pd.Series(['banana', 42])
 +
s = pd.Series(['banana', 42], index=['Fruit', 'Calories'])
 +
s.values
 +
s.keys()
 +
s.values[0]
 +
s.keys()[0]
 +
s.min()
 +
s.max()
 +
s.std()
 
</syntaxhighlight>
 
</syntaxhighlight>
  
 
==Dataframe==
 
==Dataframe==
 +
 +
===Create===
 +
<syntaxhighlight lang="python3">
 +
scientists = pd.DataFrame({
 +
    'Name': ['Rosaline Franklin', 'William Gosset'],
 +
    'Occupation': ['Chemist', 'Statistician'],
 +
    'Born': ['1920-07-25', '1876-06-13'],
 +
    'Died': ['1958-04-16', '1937-10-16'],
 +
})
 +
</syntaxhighlight>If you want to keep the order of columns:<syntaxhighlight lang="python3">
 +
scientists = pd.DataFrame({
 +
    'Occupation': ['Chemist', 'Statistician'],
 +
    'Born': ['1920-07-25', '1876-06-13'],
 +
    'Died': ['1958-04-16', '1937-10-16'],
 +
}, index=['Rosaline Franklin', 'William Gosset'], columns=['Occupation', 'Born', 'Died'])
 +
</syntaxhighlight>
 +
 +
===From CSV===
 +
<syntaxhighlight lang="python3">
 +
news = pd.read_csv('news_2019.05.10.csv')
 +
</syntaxhighlight>
  
 
===Info===
 
===Info===
Line 21: Line 55:
 
df.tail()
 
df.tail()
 
df.info()
 
df.info()
 +
df.describe()
 
</syntaxhighlight>
 
</syntaxhighlight>
  
 
===Select 1 row===
 
===Select 1 row===
 
<syntaxhighlight lang="python3">
 
<syntaxhighlight lang="python3">
df.iloc[[1]]
+
df.iloc[[1]] # for positional indexing
 +
df.loc[0]    # for label based
 +
df.iloc[-1] == df.loc[df.shape[0]-1]
 +
</syntaxhighlight>
 +
 
 +
===Select specific rows===
 +
<syntaxhighlight lang="python3">
 +
df.loc[[9, 99, 999]]
 
</syntaxhighlight>
 
</syntaxhighlight>
  
Line 36: Line 78:
 
</syntaxhighlight>
 
</syntaxhighlight>
  
=== Select multiple columns ===
+
===Select multiple columns===
 
<syntaxhighlight lang="python3">
 
<syntaxhighlight lang="python3">
 
df[['column1', 'column2', 'column3']]
 
df[['column1', 'column2', 'column3']]
Line 44: Line 86:
 
<syntaxhighlight lang="python3">
 
<syntaxhighlight lang="python3">
 
df.iloc[1][1]
 
df.iloc[1][1]
# Or
 
 
df.iloc[1]['summary']
 
df.iloc[1]['summary']
 +
df.iloc[1, 3]
 +
 +
df.loc[1, 'summary']
 +
</syntaxhighlight>
 +
 +
===Subset multiple rows and multiple columns===
 +
<syntaxhighlight lang="python3">
 +
df.iloc[[1,34,56],[2,4,5]]
 +
df.loc[[1,34,56],['modification_date', 'content']]
 
</syntaxhighlight>
 
</syntaxhighlight>
 
[[Category:Python]]
 
[[Category:Python]]
 +
[[Category:DataScience]]

Latest revision as of 21:28, 12 February 2022

Install[edit]

pip install pandas

Read CSV[edit]

import pandas as pd

df = pd.read_csv('news_2019.05.10.csv')

DataSeries[edit]

s = pd.Series(['banana', 42])
s = pd.Series(['banana', 42], index=['Fruit', 'Calories'])
s.values
s.keys()
s.values[0]
s.keys()[0]
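# min/max/std need comparable (e.g. numeric) values; on the mixed str/int Series above they raise TypeError
s.min()
s.max()
s.std()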

Dataframe[edit]

Create[edit]

scientists = pd.DataFrame({
    'Name': ['Rosaline Franklin', 'William Gosset'],
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'Died': ['1958-04-16', '1937-10-16'],
})

If you want to set the row labels (index) and keep a specific order of columns:

scientists = pd.DataFrame({
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'Died': ['1958-04-16', '1937-10-16'],
}, index=['Rosaline Franklin', 'William Gosset'], columns=['Occupation', 'Born', 'Died'])

From CSV[edit]

news = pd.read_csv('news_2019.05.10.csv')

Info[edit]

df.index
df.columns
df.values
df.shape
df.dtypes
df.head()
df.tail()
df.info()
df.describe()

Select 1 row[edit]

df.iloc[[1]] # for positional indexing
df.loc[0]    # for label-based indexing
df.iloc[-1] == df.loc[df.shape[0]-1]  # same row only with the default integer index (labels 0..n-1)

Select specific rows[edit]

df.loc[[9, 99, 999]]

Select 1 column[edit]

summaries = df[['summary']]
# Or
list(df['one'])
dfToList = df['one'].tolist()

Select multiple columns[edit]

df[['column1', 'column2', 'column3']]

Select 1 cell[edit]

df.iloc[1][1]
df.iloc[1]['summary']
df.iloc[1, 3]

df.loc[1, 'summary']

Subset multiple rows and multiple columns[edit]

df.iloc[[1,34,56],[2,4,5]]
df.loc[[1,34,56],['modification_date', 'content']]