由於在測系統 performance 時, 發現"open" 這個system call 花的時間會比其他system call多, 所以trace了一下整個 open syscall 的流程.
1. open 進入kernel後,會通過一系列轉換,最終會呼叫 fs/open.c 的macro function.
/*
 * open(2) entry point (fs/open.c).
 *
 * When force_o_largefile() evaluates true, O_LARGEFILE is OR-ed into the
 * caller's flags.  The actual work is delegated to do_sys_open(), with
 * AT_FDCWD passed as the directory fd.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	if (force_o_largefile()) {
		flags = flags | O_LARGEFILE;
	}

	return do_sys_open(AT_FDCWD, filename, flags, mode);
}
2. 第一步先透過 force_o_largefile() 檢查系統是 32 or 64 bit; 若不是 32 bit (BITS_PER_LONG != 32), 則自動在 flags 加上 O_LARGEFILE.
/*
 * force_o_largefile() - default definition, used only when no earlier
 * definition of the macro exists.
 *
 * Evaluates to non-zero when BITS_PER_LONG is not 32.
 *
 * Each preprocessor directive must sit on its own line; with all three on
 * one line the #ifndef is never terminated.
 */
#ifndef force_o_largefile
#define force_o_largefile() (BITS_PER_LONG != 32)
#endif
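3. 接著 fs/open.c 下的 do_sys_open() 進行後續處理. 首先透過 build_open_flags() 檢查 user 傳入的 flags/mode, 並轉換成 kernel 內部使用的 struct open_flags (這一步還不會去找 dentry & inode, 真正的 path lookup 是在後面的 do_filp_open() 裡進行).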
/*
 * do_sys_open() - common back end for opening a file by user-space path.
 *
 * Steps, in order:
 *   1. build_open_flags() fills 'op' from flags/mode; a non-zero result is
 *      returned immediately.
 *   2. getname() produces a struct filename for the user pointer; an
 *      ERR_PTR result is returned as PTR_ERR().
 *   3. get_unused_fd_flags() reserves a descriptor number.
 *   4. do_filp_open() produces the struct file.  On failure the reserved
 *      descriptor is released with put_unused_fd() and the error is returned.
 *   5. On success fsnotify_open() is called and fd_install() binds the file
 *      to the descriptor.
 *
 * The struct filename from step 2 is always released with putname() once
 * step 2 has succeeded.
 *
 * Returns the new descriptor (>= 0) or a negative error value.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_flags op;
	struct filename *name;
	struct file *filp;
	int ret;

	ret = build_open_flags(flags, mode, &op);
	if (ret)
		return ret;

	name = getname(filename);
	if (IS_ERR(name))
		return PTR_ERR(name);

	ret = get_unused_fd_flags(flags);
	if (ret < 0)
		goto out_putname;

	filp = do_filp_open(dfd, name, &op);
	if (IS_ERR(filp)) {
		/* Give the reserved descriptor back before reporting the error. */
		put_unused_fd(ret);
		ret = PTR_ERR(filp);
		goto out_putname;
	}

	fsnotify_open(filp);
	fd_install(ret, filp);

out_putname:
	putname(name);
	return ret;
}
4. 接著進入 getname() (fs/namei.c): 它會 allocate kernel memory (names_cache), 並用 strncpy_from_user() 把 pathname 從 user space 複製到 kernel space.
/*
 * getname() - convenience wrapper around getname_flags().
 *
 * Calls getname_flags() with flags == 0 and a NULL 'empty' pointer, and
 * returns whatever it returns.
 */
struct filename *getname(const char __user *filename)
{
	struct filename *name = getname_flags(filename, 0, NULL);

	return name;
}
/*
 * getname_flags() - copy a pathname from user space into a struct filename.
 *
 * @filename: user-space pointer to the pathname string.
 * @flags:    only LOOKUP_EMPTY is examined here; it decides whether an
 *            empty pathname is accepted.
 * @empty:    optional; when non-NULL, *empty is set to 1 if the copied
 *            pathname has length 0.
 *
 * If audit_reusename() returns a non-NULL entry, that entry is returned
 * as-is.  Otherwise a buffer is taken with __getname() and the name is
 * copied with strncpy_from_user().
 *
 * Returns the struct filename on success, or an ERR_PTR():
 *   -ENOMEM        an allocation (__getname() or kzalloc()) failed
 *   negative len   passed through from strncpy_from_user()
 *   -ENAMETOOLONG  the second copy filled all PATH_MAX bytes
 *   -ENOENT        the name is empty and LOOKUP_EMPTY is not set in flags
 */
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
	struct filename *result;
	char *kname;
	int len;

	/* Reuse an existing entry if the audit code hands one back. */
	result = audit_reusename(filename);
	if (result)
		return result;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
	kname = (char *)result->iname;
	result->name = kname;

	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
	if (unlikely(len < 0)) {
		__putname(result);
		return ERR_PTR(len);
	}

	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
		const size_t size = offsetof(struct filename, iname[1]);

		/*
		 * From here on the names_cache buffer is referenced through
		 * kname, and result is re-pointed at a new, small allocation.
		 */
		kname = (char *)result;

		/*
		 * size is chosen this way to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
		}
		result->name = kname;
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			/* Both allocations must be released on this path. */
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
	}

	result->refcnt = 1;
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
			*empty = 1;
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
	}

	/* Record the originating user pointer and hand the name to audit. */
	result->uptr = filename;
	result->aname = NULL;
	audit_getname(result);
	return result;
}
include/linux/fs.h
/*
 * __getname()     - take one object from names_cachep via kmem_cache_alloc()
 *                   with GFP_KERNEL.
 * __putname(name) - give 'name' back to names_cachep via kmem_cache_free().
 *
 * Each #define must be on its own line; otherwise the second directive
 * becomes part of the first macro's replacement list.
 */
#define __getname()		kmem_cache_alloc(names_cachep, GFP_KERNEL)
#define __putname(name)		kmem_cache_free(names_cachep, (void *)(name))
5. 若 getname() 成功 (kernel memory allocate 成功, 且 pathname 已從 user space 複製進來; 此時尚未檢查檔案是否存在), 則執行 get_unused_fd_flags() 來拿到一個可用的 file descriptor.
fs/file.c
/*
 * get_unused_fd_flags() - reserve a descriptor number for the current task.
 *
 * Forwards to __alloc_fd() on current->files, with a start of 0 and
 * rlimit(RLIMIT_NOFILE) as the end, passing 'flags' through unchanged.
 * Returns whatever __alloc_fd() returns.
 */
int get_unused_fd_flags(unsigned flags)
{
	struct files_struct *files = current->files;

	return __alloc_fd(files, 0, rlimit(RLIMIT_NOFILE), flags);
}
6. 拿到fd後, 使用do_filp_open()開啟.
fs/namei.c
/*
 * do_filp_open() - open 'pathname' relative to 'dfd', with up to three attempts.
 *
 * Attempt 1: path_openat() with LOOKUP_RCU added to op->lookup_flags.
 * Attempt 2: only if the result was ERR_PTR(-ECHILD); same call without
 *            LOOKUP_RCU.
 * Attempt 3: only if the current result is ERR_PTR(-ESTALE); same call with
 *            LOOKUP_REVAL added.
 *
 * Returns the result of the last attempt made (a struct file pointer or an
 * ERR_PTR).
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
			  const struct open_flags *op)
{
	const int lookup_flags = op->lookup_flags;
	struct nameidata nd;
	struct file *file;

	file = path_openat(dfd, pathname, &nd, op, lookup_flags | LOOKUP_RCU);

	if (unlikely(file == ERR_PTR(-ECHILD)))
		file = path_openat(dfd, pathname, &nd, op, lookup_flags);

	if (unlikely(file == ERR_PTR(-ESTALE)))
		file = path_openat(dfd, pathname, &nd, op,
				   lookup_flags | LOOKUP_REVAL);

	return file;
}
7. 在do_filp_open()中呼叫path_openat. path_openat中前幾個是初始化的function. 最後執行do_last() function.
fs/namei.c
/*
 * path_openat() - one attempt at resolving 'pathname' and opening it.
 *
 * @dfd:      directory fd, passed to path_init() / do_tmpfile().
 * @pathname: name to open.
 * @nd:       caller-provided nameidata used for the walk.
 * @op:       open parameters; op->open_flag becomes file->f_flags.
 * @flags:    lookup flags for this attempt (LOOKUP_RCU is tested below).
 *
 * Takes a struct file from get_empty_filp(), then either handles
 * __O_TMPFILE through do_tmpfile(), or runs path_init() followed by
 * do_last().  While do_last() returns a positive value (trailing symlink),
 * the link is followed and do_last() is called again.
 *
 * Returns the struct file on success or ERR_PTR(error).  -EOPENSTALE is
 * never returned: it is mapped to -ECHILD when LOOKUP_RCU is set in
 * 'flags', otherwise to -ESTALE.
 */
static struct file *path_openat(int dfd, struct filename *pathname,
		struct nameidata *nd, const struct open_flags *op, int flags)
{
	struct file *file;
	struct path path;
	int opened = 0;
	int error;

	file = get_empty_filp();
	if (IS_ERR(file))
		return file;

	file->f_flags = op->open_flag;

	/* __O_TMPFILE takes a separate path and skips path_cleanup(). */
	if (unlikely(file->f_flags & __O_TMPFILE)) {
		error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
		goto out2;
	}

	error = path_init(dfd, pathname, flags, nd);
	if (unlikely(error))
		goto out;

	error = do_last(nd, &path, file, op, &opened, pathname);
	while (unlikely(error > 0)) { /* trailing symlink */
		struct path link = path;
		void *cookie;

		/* A symlink we are not allowed to follow: fail with -ELOOP. */
		if (!(nd->flags & LOOKUP_FOLLOW)) {
			path_put_conditional(&path, nd);
			path_put(&nd->path);
			error = -ELOOP;
			break;
		}
		error = may_follow_link(&link, nd);
		if (unlikely(error))
			break;
		/* Adjust the lookup flags before following the link. */
		nd->flags |= LOOKUP_PARENT;
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
		error = follow_link(&link, nd, &cookie);
		if (unlikely(error))
			break;
		error = do_last(nd, &path, file, op, &opened, pathname);
		put_link(nd, &link, cookie);
	}
out:
	path_cleanup(nd);
out2:
	/* If the file was never marked opened, an error must be set; drop it. */
	if (!(opened & FILE_OPENED)) {
		BUG_ON(!error);
		put_filp(file);
	}
	if (unlikely(error)) {
		/* Translate -EOPENSTALE into the code that matches this attempt's mode. */
		if (error == -EOPENSTALE) {
			if (flags & LOOKUP_RCU)
				error = -ECHILD;
			else
				error = -ESTALE;
		}
		file = ERR_PTR(error);
	}
	return file;
}
8. 若沒有帶 O_CREAT flag (即預期文件已存在), do_last() 會先用 lookup_fast() 找到目標, 接著在 finish_open 標籤之後經過 may_open() 檢查權限, 再呼叫 vfs_open() 將文件打開, 最後返回 do_sys_open().
fs/namei.c
</h3> <h3>static int do_last(struct nameidata *nd, struct path *path, struct file *file, const struct open_flags *op, int *opened, struct filename *name) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; int acc_mode = op->acc_mode; struct inode *inode; bool symlink_ok = false; struct path save_parent = { .dentry = NULL, .mnt = NULL }; bool retried = false; int error; nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); if (error) return error; goto finish_open; } if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) symlink_ok = true; /* we _can_ be in RCU mode here */ error = lookup_fast(nd, path, &inode); if (likely(!error)) goto finish_lookup; if (error < 0) goto out; BUG_ON(nd->inode != dir->d_inode); } else { /* create side of things */ /* * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED * has been cleared when we got to the last component we are * about to look up */ error = complete_walk(nd); if (error) return error; audit_inode(name, dir, LOOKUP_PARENT); error = -EISDIR; /* trailing slashes? */ if (nd->last.name[nd->last.len]) goto out; } retry_lookup: if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { error = mnt_want_write(nd->path.mnt); if (!error) got_write = true; /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. 
*/ } mutex_lock(&dir->d_inode->i_mutex); error = lookup_open(nd, path, file, op, got_write, opened); mutex_unlock(&dir->d_inode->i_mutex); if (error <= 0) { if (error) goto out; if ((*opened & FILE_CREATED) || !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; audit_inode(name, file->f_path.dentry, 0); goto opened; } if (*opened & FILE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; will_truncate = false; acc_mode = MAY_OPEN; path_to_nameidata(path, nd); goto finish_open_created; } /* * create/update audit record if it already exists. */ if (d_is_positive(path->dentry)) audit_inode(name, path->dentry, 0); /* * If atomic_open() acquired write access it is dropped now due to * possible mount and symlink following (this might be optimized away if * necessary...) */ if (got_write) { mnt_drop_write(nd->path.mnt); got_write = false; } error = -EEXIST; if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) goto exit_dput; error = follow_managed(path, nd->flags); if (error < 0) goto exit_dput; if (error) nd->flags |= LOOKUP_JUMPED; BUG_ON(nd->flags & LOOKUP_RCU); inode = path->dentry->d_inode; error = -ENOENT; if (d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } finish_lookup: /* we _can_ be in RCU mode here */ if (should_follow_link(path->dentry, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(nd->path.mnt != path->mnt || unlazy_walk(nd, path->dentry))) { error = -ECHILD; goto out; } } BUG_ON(inode != path->dentry->d_inode); return 1; } if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) { path_to_nameidata(path, nd); } else { save_parent.dentry = nd->path.dentry; save_parent.mnt = mntget(path->mnt); nd->path.dentry = path->dentry; } nd->inode = inode; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... 
*/ finish_open: error = complete_walk(nd); if (error) { path_put(&save_parent); return error; } audit_inode(name, nd->path.dentry, 0); error = -EISDIR; if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; if (!d_is_reg(nd->path.dentry)) will_truncate = false; if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) goto out; got_write = true; } finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file, current_cred()); if (!error) { *opened |= FILE_OPENED; } else { if (error == -EOPENSTALE) goto stale_open; goto out; } opened: error = open_check_o_direct(file); if (error) goto exit_fput; error = ima_file_check(file, op->acc_mode, *opened); if (error) goto exit_fput; if (will_truncate) { error = handle_truncate(file); if (error) goto exit_fput; } out: if (got_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); terminate_walk(nd); return error; exit_dput: path_put_conditional(path, nd); goto out; exit_fput: fput(file); goto out; stale_open: /* If no saved parent or already retried then can't retry */ if (!save_parent.dentry || retried) goto out; BUG_ON(save_parent.dentry != dir); path_put(&nd->path); nd->path = save_parent; nd->inode = dir->d_inode; save_parent.mnt = NULL; save_parent.dentry = NULL; if (got_write) { mnt_drop_write(nd->path.mnt); got_write = false; } retried = true; goto retry_lookup; }
9. 有了 filp & fd 之後, do_sys_open() 接著使用 fsnotify_open() 對此 file pointer 發出 FS_OPEN 事件通知 (讓 inotify/fanotify 等監控機制知道檔案被開啟).
include/linux/fsnotify.h
/*
 * fsnotify_open() - report that 'file' has been opened.
 *
 * Builds an event mask of FS_OPEN, with FS_ISDIR added when the file's
 * inode mode satisfies S_ISDIR(), then passes it to fsnotify_parent() and
 * fsnotify() together with the file's path.
 */
static inline void fsnotify_open(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct path *path = &file->f_path;
	__u32 mask = S_ISDIR(inode->i_mode) ? (FS_OPEN | FS_ISDIR) : FS_OPEN;

	fsnotify_parent(path, NULL, mask);
	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
}
10. 最後則是在do_sys_open()使用fd_install()將fd和file pointer 做mapping.
並return fd 回去
fs/file.c
void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); } void fd_install(unsigned int fd, struct file *file) { __fd_install(current->files, fd, file); }
心得:
經由這一系列的trace, 發現原來open 花的時間多的原因是: 其他system call (read,write, etc..)
測試時使用現成的fd, 而沒有像open需要重新allocate一個fd.
光執行search dentry/ inode/ fd table/ fd mapping 就需花不少時間.