Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
software_development:python_pandas [2022/08/04 14:39] prgram |
software_development:python_pandas [2023/05/16 15:17] (current) prgram [encoding_errors - 'ignore'] |
||
---|---|---|---|
Line 2: | Line 2: | ||
{{INLINETOC}} | {{INLINETOC}} | ||
- | === Shapes === | + | === etc : list === |
<code python> | <code python> | ||
- | df.shape | + | set( [list] ) # unique value |
- | df.describe() | + | [list].sort() #자동적용? |
- | df.info() | + | [list1] + [list2] #list합치기 |
</code> | </code> | ||
- | #use the dataframe.nunique() function to find the unique values | + | ===== shape of df ===== |
- | unique_counts = pd.DataFrame.from_records([(col, df[col].nunique()) for col in df.columns],columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique']) | + | === Pivot_table === |
- | + | ||
- | df.loc[df.sex=='W', 'year_born'].hist() | + | |
- | + | ||
- | === Selects === | + | |
<code python> | <code python> | ||
- | iloc: Select by position | + | df.pivot_table(index=[인덱스컬럼], |
- | loc: Select by label | + | columns=[컬럼1,컬럼2], |
+ | values=[값], | ||
+ | aggfunc='sum').reset_index() | ||
+ | </code> | ||
+ | * string일 때는 aggfunc='max' | ||
+ | * index에 NULL 있으면 안됨 | ||
+ | == fillna == | ||
+ | <code python> | ||
+ | df[['a','b','c']] = df[['a','b','c']].fillna('') | ||
</code> | </code> | ||
+ | === group by === | ||
+ | <code python> | ||
+ | df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index() | ||
- | + | df.groupby([COLUMNS])['COLUMN'].max().reset_index() | |
- | + | ||
- | ==I/O file== | + | |
+ | df = df.assign(date=pd.to_numeric(df['date'], errors='coerce')).groupby(['코드', '종목명']).agg({'date':np.min}).reset_index().drop_duplicates() | ||
- | === chunk === | + | df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.min, np.max]}).reset_index(level='종목코드') |
- | <code python> | + | df.columns = df.columns.droplevel() |
- | for index, chunk in enumerate( pd.read_csv(filename, chunksize=chunk_size, dtype=str), start=1): | + | |
- | chunk.memory_usage() | + | |
</code> | </code> | ||
+ | === rank === | ||
<code python> | <code python> | ||
- | df_chunk = pd.read_csv(r'../input/data.csv', chunksize=1000000) # chunk : # of lines | + | df['rank'] = df.groupby('code')['value'].rank(ascending=False) |
- | for chunk in df_chunk: | + | |
- | # perform data filtering | + | |
- | chunk_filter = chunk_preprocessing(chunk) | + | |
</code> | </code> | ||
+ | === merge === | ||
<code python> | <code python> | ||
- | df3=pd.read_csv("abc.txt", skiprows=1, header=None, | + | df_out = df_out.merge(df, on=['no', 'name'], how='outer') #left_on right_on |
- | sep="|", nrows=100, encoding="utf-16") #, error_bad_lines=False) | + | |
- | + | ||
- | with open("abc.txt", encoding="utf-16") as f: | + | |
- | i=0 | + | |
- | for line in f: | + | |
- | i=i+1 | + | |
- | print(line) | + | |
- | if i >3 : break | + | |
- | </code> | + | |
- | + | ||
- | + | ||
- | file open/ append lines : https://thispointer.com/how-to-append-text-or-lines-to-a-file-in-python/ | + | |
- | === row iteration === | + | |
- | <code python> | + | |
- | for idx,row in anime[:2].iterrows(): | + | |
- | print(idx, row) | + | |
</code> | </code> | ||
+ | ===== modify ===== | ||
=== Series to DF === | === Series to DF === | ||
<code python> | <code python> | ||
df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(), ignore_index=True) | df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(), ignore_index=True) | ||
</code> | </code> | ||
+ | |||
+ | |||
=== rename === | === rename === | ||
<code python> | <code python> | ||
Line 72: | Line 62: | ||
df.columns = ['1'] + df.columns[1:].tolist() | df.columns = ['1'] + df.columns[1:].tolist() | ||
</code> | </code> | ||
+ | |||
+ | === order of columns === | ||
+ | <code python> | ||
+ | #1 | ||
+ | df = df.sort_index(axis='columns', level = 'MULTILEVEL INDEX NAME/no') | ||
+ | #2 | ||
+ | df.columns | ||
+ | col_order = ['a','b','c'] | ||
+ | df = df.reindex(col_order, axis='columns') | ||
+ | </code> | ||
+ | |||
=== map === | === map === | ||
Line 77: | Line 78: | ||
df['코드'] = 'A' + df['코드'].map(lambda x: f'{x:0>6}') #6글자로 | df['코드'] = 'A' + df['코드'].map(lambda x: f'{x:0>6}') #6글자로 | ||
</code> | </code> | ||
- | === Pivot_table === | + | |
+ | |||
+ | ===== get info ===== | ||
+ | |||
+ | === Shapes === | ||
<code python> | <code python> | ||
- | df.pivot_table(index=[인덱스컬럼], | + | df.shape |
- | columns=[컬럼1,컬럼2], | + | df.describe() |
- | values=[값], | + | df.info() |
- | aggfunc='sum').reset_index() | + | |
</code> | </code> | ||
- | === group by === | + | #use the dataframe.nunique() function to find the unique values |
+ | unique_counts = pd.DataFrame.from_records([(col, df[col].nunique()) for col in df.columns],columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique']) | ||
+ | |||
+ | df.loc[df.sex=='W', 'year_born'].hist() | ||
+ | |||
+ | === type === | ||
<code python> | <code python> | ||
- | df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index() | + | df['value'] = pd.to_numeric(df['value'], errors='coerce') |
+ | df['value'].dtype | ||
+ | </code> | ||
- | df = df.assign(date=pd.to_numeric(df['date'], errors='coerce')).groupby(['코드', '종목명']).agg({'date':np.min}).reset_index().drop_duplicates() | ||
- | df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.min, np.max]}).reset_index(level='종목코드') | + | ===== get element ===== |
- | df.columns = df.columns.droplevel() | + | |
+ | === Selects === | ||
+ | <code python> | ||
+ | iloc: Select by position | ||
+ | loc: Select by label | ||
+ | |||
+ | df.loc[:,~df.columns.isin(['a','b'])] | ||
+ | |||
+ | df[~( df['a'].isin(['1','2','3']) & df['b']=='3' )] #row-wise | ||
+ | df.loc[~( df['a'].isin(['1','2','3']) & df['b']=='3' ), 8] #row-wise & column | ||
</code> | </code> | ||
- | === merge === | + | === row iteration === |
<code python> | <code python> | ||
- | df_out = df_out.merge(df, on=['no', 'name'], how='outer') | + | for idx,row in anime[:2].iterrows(): |
+ | print(idx, row) | ||
</code> | </code> | ||
- | === rank === | + | |
+ | |||
+ | =====I/O file===== | ||
+ | |||
+ | === encoding_errors - 'ignore'=== | ||
+ | Encoding 제대로 했는데도 안되면.. | ||
+ | 공공데이터가 이런 경우가 많음. | ||
+ | |||
+ | Error tokenizing data. C error: EOF inside string starting at row 0 | 판다스 에러 | ||
+ | https://con2joa.tistory.com/m/60 | ||
+ | quoting=csv.QUOTE_NONE 파라미터 | ||
<code python> | <code python> | ||
- | df['rank'] = df.groupby('code')['value'].rank(ascending=False) | + | import chardet |
+ | with open(file, 'rb') as rawdata: | ||
+ | result = chardet.detect(rawdata.read(100000)) | ||
+ | result | ||
+ | |||
+ | |||
+ | data = pd.read_csv( file, encoding='cp949', encoding_errors='ignore') | ||
+ | # on_bad_lines='skip' | ||
+ | # error_bad_lines=False | ||
</code> | </code> | ||
+ | |||
+ | === to_numeric === | === to_numeric === | ||
<code python> | <code python> | ||
Line 120: | Line 160: | ||
</code> | </code> | ||
+ | === chunk === | ||
+ | <code python> | ||
+ | for index, chunk in enumerate( pd.read_csv(filename, chunksize=chunk_size, dtype=str), start=1): | ||
+ | chunk.memory_usage() | ||
+ | </code> | ||
- | === type === | ||
<code python> | <code python> | ||
- | df['value'] = pd.to_numeric(df['value'], errors='coerce') | + | df_chunk = pd.read_csv(r'../input/data.csv', chunksize=1000000) # chunk : # of lines |
- | df['value'].dtype | + | for chunk in df_chunk: |
+ | # perform data filtering | ||
+ | chunk_filter = chunk_preprocessing(chunk) | ||
</code> | </code> | ||
+ | |||
+ | <code python> | ||
+ | df3=pd.read_csv("abc.txt", skiprows=1, header=None, | ||
+ | sep="|", nrows=100, encoding="utf-16") #, error_bad_lines=False) | ||
+ | |||
+ | with open("abc.txt", encoding="utf-16") as f: | ||
+ | i=0 | ||
+ | for line in f: | ||
+ | i=i+1 | ||
+ | print(line) | ||
+ | if i >3 : break | ||
+ | </code> | ||
+ | | ||
+ | | ||
+ | file open/ append lines : https://thispointer.com/how-to-append-text-or-lines-to-a-file-in-python/ | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
{{tag>software_development python 파이썬 pandas 데이터정리 data_analysis}} | {{tag>software_development python 파이썬 pandas 데이터정리 data_analysis}} |