Linux kernel

在Kernel 4.1 Trace Open syscall 筆記

由於在測系統 performance 時, 發現”open” 這個system call 花的時間會比其他system call多, 所以trace了一下整個 open syscall 的流程.

1. open 進入 kernel 後, 會通過一系列轉換, 最終會呼叫 fs/open.c 中以 SYSCALL_DEFINE3 macro 定義的 open function.

[code language=”cpp”]
/*
 * open(2) system call entry point, defined through the SYSCALL_DEFINE3 macro.
 * ORs O_LARGEFILE into flags when force_o_largefile() is true, then hands the
 * request to do_sys_open() with AT_FDCWD as the directory fd and returns
 * whatever do_sys_open() returns.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
if (force_o_largefile())
flags |= O_LARGEFILE;

return do_sys_open(AT_FDCWD, filename, flags, mode);
}
[/code]

2. 第一步先透過 force_o_largefile() 檢查 BITS_PER_LONG 是否為 32; 若不是 32 (例如 64 bit 系統), 則自動在 flags 加上 O_LARGEFILE.

[code language=”cpp”]
/*
 * Fallback definition, used only when force_o_largefile is not already
 * defined: evaluates to true whenever BITS_PER_LONG is not 32.
 */
#ifndef force_o_largefile
#define force_o_largefile() (BITS_PER_LONG != 32)
#endif
[/code]

 

3. 接著由 fs/open.c 下的 do_sys_open() 進行後續處理. 首先透過 build_open_flags() 將 flags 與 mode 轉換成 struct open_flags (dentry & inode 的找尋要到後面的 do_filp_open() 才會進行).

[code language=”cpp”]
/*
 * Common implementation behind the open system call.
 *
 * @dfd:      directory fd passed through to do_filp_open()
 * @filename: user-space pointer to the path string
 * @flags:    open flags supplied by the caller
 * @mode:     creation mode supplied by the caller
 *
 * Returns the new file descriptor (>= 0) on success, otherwise the error
 * value produced by whichever step failed.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_flags op;
/* build_open_flags() fills op from flags/mode; non-zero means failure */
int fd = build_open_flags(flags, mode, &op);
struct filename *tmp;

if (fd)
return fd;

/* copy the path from user space into a kernel struct filename */
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);

/* reserve a descriptor number first, then try to open the file */
fd = get_unused_fd_flags(flags);
if (fd >= 0) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
/* open failed: give the reserved fd back and return the error */
put_unused_fd(fd);
fd = PTR_ERR(f);
} else {
/* open succeeded: send the open notification, then bind fd -> file */
fsnotify_open(f);
fd_install(fd, f);
}
}
/* the kernel copy of the name is released on every path */
putname(tmp);
return fd;
}
[/code]

4. 接著進入 getname() (fs/namei.c), 它會 allocate kernel memory, 並透過 strncpy_from_user() 把 user space 的檔名複製進 kernel.

[code language=”cpp”]

/* Convenience wrapper: getname_flags() with flags 0 and no "empty" out-parameter. */
struct filename * getname(const char __user * filename)

{ return getname_flags(filename, 0, NULL); }

[/code]

 

[code language=”cpp”]
/*
 * Copy a path name from user space into a kernel struct filename.
 *
 * @filename: user-space pointer to the path string
 * @flags:    only LOOKUP_EMPTY is examined here; it permits an empty path
 * @empty:    optional out-parameter, set to 1 when the path is empty
 *
 * Returns the struct filename on success, or an ERR_PTR():
 * -ENOMEM when an allocation fails, the negative value returned by
 * strncpy_from_user() when the copy fails, -ENAMETOOLONG when the name
 * fills PATH_MAX, and -ENOENT for an empty path without LOOKUP_EMPTY.
 */
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
struct filename *result;
char *kname;
int len;

/* reuse an existing struct filename if the audit code provides one */
result = audit_reusename(filename);
if (result)
return result;

result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);

/*
* First, try to embed the struct filename inside the names_cache
* allocation
*/
kname = (char *)result->iname;
result->name = kname;

len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
if (unlikely(len < 0)) {
__putname(result);
return ERR_PTR(len);
}

/*
* Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
* separate struct filename so we can dedicate the entire
* names_cache allocation for the pathname, and re-do the copy from
* userland.
*/
if (unlikely(len == EMBEDDED_NAME_MAX)) {
const size_t size = offsetof(struct filename, iname[1]);
/* the names_cache buffer is now used purely as the name buffer */
kname = (char *)result;

/*
* size is chosen that way we to guarantee that
* result->iname[0] is within the same object and that
* kname can't be equal to result->iname, no matter what.
*/
result = kzalloc(size, GFP_KERNEL);
if (unlikely(!result)) {
__putname(kname);
return ERR_PTR(-ENOMEM);
}
result->name = kname;
len = strncpy_from_user(kname, filename, PATH_MAX);
if (unlikely(len < 0)) {
__putname(kname);
kfree(result);
return ERR_PTR(len);
}
/* a copy that fills all of PATH_MAX is rejected as too long */
if (unlikely(len == PATH_MAX)) {
__putname(kname);
kfree(result);
return ERR_PTR(-ENAMETOOLONG);
}
}

result->refcnt = 1;
/* The empty path is special. */
if (unlikely(!len)) {
if (empty)
*empty = 1;
if (!(flags & LOOKUP_EMPTY)) {
putname(result);
return ERR_PTR(-ENOENT);
}
}

/* remember the original user pointer and hand the name to the audit code */
result->uptr = filename;
result->aname = NULL;
audit_getname(result);
return result;
}

[/code]
include/linux/fs.h

[code language=”cpp”]
/* Allocate (GFP_KERNEL) / free one name buffer from the names_cachep kmem cache. */
#define __getname()             kmem_cache_alloc(names_cachep, GFP_KERNEL)
#define __putname(name)         kmem_cache_free(names_cachep, (void *)(name))
[/code]

5. 若檔名複製成功 (getname() 沒有回傳錯誤), 則執行 get_unused_fd_flags() 來拿到一個可用的 file descriptor. 注意此時尚未檢查檔案是否存在.

fs/file.c

[code language=”cpp”]
/*
 * Allocate a file descriptor in the current task's files table by calling
 * __alloc_fd() with 0 and rlimit(RLIMIT_NOFILE) -- presumably the lower and
 * upper bounds of the search range; confirm against __alloc_fd().
 * Returns whatever __alloc_fd() returns.
 */
int get_unused_fd_flags(unsigned flags)
{
return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
}
[/code]

6. 拿到fd, 使用do_filp_open()開啟.

fs/namei.c

[code language=”cpp”]

/*
 * Open pathname relative to dfd, making up to three path_openat() attempts.
 * Returns the struct file from the last attempt made, which may be an
 * ERR_PTR().
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op)
{
struct nameidata nd;
int flags = op->lookup_flags;
struct file *filp;

/* first attempt: with LOOKUP_RCU added to the lookup flags */
filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
/* -ECHILD from the RCU attempt: retry without LOOKUP_RCU */
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(dfd, pathname, &nd, op, flags);
/* -ESTALE: retry once more with LOOKUP_REVAL */
if (unlikely(filp == ERR_PTR(-ESTALE)))
filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
return filp;
}

[/code]

7. 在do_filp_open()中呼叫path_openat. path_openat中前幾個是初始化的function. 最後執行do_last() function.

fs/namei.c

[code language=”cpp”]
/*
 * One attempt at opening pathname relative to dfd with the given lookup
 * flags. Returns the opened struct file on success, or an ERR_PTR() on
 * failure.
 */
static struct file *path_openat(int dfd, struct filename *pathname,
struct nameidata *nd, const struct open_flags *op, int flags)
{
struct file *file;
struct path path;
int opened = 0;
int error;

file = get_empty_filp();
if (IS_ERR(file))
return file;

file->f_flags = op->open_flag;

/* __O_TMPFILE is handled entirely by do_tmpfile(); it skips path_cleanup() */
if (unlikely(file->f_flags & __O_TMPFILE)) {
error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
goto out2;
}

error = path_init(dfd, pathname, flags, nd);
if (unlikely(error))
goto out;

/* do_last() returns > 0 when the last component is a symlink to follow */
error = do_last(nd, &path, file, op, &opened, pathname);
while (unlikely(error > 0)) { /* trailing symlink */
struct path link = path;
void *cookie;
/* a symlink was found but following it was not requested */
if (!(nd->flags & LOOKUP_FOLLOW)) {
path_put_conditional(&path, nd);
path_put(&nd->path);
error = -ELOOP;
break;
}
error = may_follow_link(&link, nd);
if (unlikely(error))
break;
nd->flags |= LOOKUP_PARENT;
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
error = follow_link(&link, nd, &cookie);
if (unlikely(error))
break;
error = do_last(nd, &path, file, op, &opened, pathname);
put_link(nd, &link, cookie);
}
out:
path_cleanup(nd);
out2:
/* the file was never opened: error must be set, and the empty filp is dropped */
if (!(opened & FILE_OPENED)) {
BUG_ON(!error);
put_filp(file);
}
if (unlikely(error)) {
/* -EOPENSTALE is translated to -ECHILD (RCU attempt) or -ESTALE */
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
else
error = -ESTALE;
}
file = ERR_PTR(error);
}
return file;
}

[/code]

8. 若沒有帶 O_CREAT flag (一般開啟既有文件的情況), do_last() 會先用 lookup_fast() 找尋目標, 接著在 finish_open label 之後透過 vfs_open() 將文件打開, 並返回 do_sys_open()

fs/namei.c

[code language=”cpp”]
<h3>static int do_last(struct nameidata *nd, struct path *path,
struct file *file, const struct open_flags *op,
int *opened, struct filename *name)
{
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool will_truncate = (open_flag & O_TRUNC) != 0;
bool got_write = false;
int acc_mode = op->acc_mode;
struct inode *inode;
bool symlink_ok = false;
struct path save_parent = { .dentry = NULL, .mnt = NULL };
bool retried = false;
int error;

nd->flags &= ~LOOKUP_PARENT;
nd->flags |= op->intent;

if (nd->last_type != LAST_NORM) {
error = handle_dots(nd, nd->last_type);
if (error)
return error;
goto finish_open;
}

if (!(open_flag & O_CREAT)) {
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
symlink_ok = true;
/* we _can_ be in RCU mode here */
error = lookup_fast(nd, path, &inode);
if (likely(!error))
goto finish_lookup;

if (error < 0)
goto out;

BUG_ON(nd->inode != dir->d_inode);
} else {
/* create side of things */
/*
* This will *only* deal with leaving RCU mode – LOOKUP_JUMPED
* has been cleared when we got to the last component we are
* about to look up
*/
error = complete_walk(nd);
if (error)
return error;

audit_inode(name, dir, LOOKUP_PARENT);
error = -EISDIR;
/* trailing slashes? */
if (nd->last.name[nd->last.len])
goto out;
}

retry_lookup:
if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
error = mnt_want_write(nd->path.mnt);
if (!error)
got_write = true;
/*
* do _not_ fail yet – we might not need that or fail with
* a different error; let lookup_open() decide; we’ll be
* dropping this one anyway.
*/
}
mutex_lock(&dir->d_inode->i_mutex);
error = lookup_open(nd, path, file, op, got_write, opened);
mutex_unlock(&dir->d_inode->i_mutex);

if (error <= 0) {
if (error)
goto out;

if ((*opened & FILE_CREATED) ||
!S_ISREG(file_inode(file)->i_mode))
will_truncate = false;

audit_inode(name, file->f_path.dentry, 0);
goto opened;
}

if (*opened & FILE_CREATED) {
/* Don’t check for write permission, don’t truncate */
open_flag &= ~O_TRUNC;
will_truncate = false;
acc_mode = MAY_OPEN;
path_to_nameidata(path, nd);
goto finish_open_created;
}

/*
* create/update audit record if it already exists.
*/
if (d_is_positive(path->dentry))
audit_inode(name, path->dentry, 0);

/*
* If atomic_open() acquired write access it is dropped now due to
* possible mount and symlink following (this might be optimized away if
* necessary…)
*/
if (got_write) {
mnt_drop_write(nd->path.mnt);
got_write = false;
}

error = -EEXIST;
if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
goto exit_dput;

error = follow_managed(path, nd->flags);
if (error < 0)
goto exit_dput;

if (error)
nd->flags |= LOOKUP_JUMPED;

BUG_ON(nd->flags & LOOKUP_RCU);
inode = path->dentry->d_inode;
error = -ENOENT;
if (d_is_negative(path->dentry)) {
path_to_nameidata(path, nd);
goto out;
}
finish_lookup:
/* we _can_ be in RCU mode here */
if (should_follow_link(path->dentry, !symlink_ok)) {
if (nd->flags & LOOKUP_RCU) {
if (unlikely(nd->path.mnt != path->mnt ||
unlazy_walk(nd, path->dentry))) {
error = -ECHILD;
goto out;
}
}
BUG_ON(inode != path->dentry->d_inode);
return 1;
}

if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
path_to_nameidata(path, nd);
} else {
save_parent.dentry = nd->path.dentry;
save_parent.mnt = mntget(path->mnt);
nd->path.dentry = path->dentry;

}
nd->inode = inode;
/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED… */
finish_open:
error = complete_walk(nd);
if (error) {
path_put(&save_parent);
return error;
}
audit_inode(name, nd->path.dentry, 0);
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
error = -ENOTDIR;
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
goto out;
if (!d_is_reg(nd->path.dentry))
will_truncate = false;

if (will_truncate) {
error = mnt_want_write(nd->path.mnt);
if (error)
goto out;
got_write = true;
}
finish_open_created:
error = may_open(&nd->path, acc_mode, open_flag);
if (error)
goto out;

BUG_ON(*opened & FILE_OPENED); /* once it’s opened, it’s opened */
error = vfs_open(&nd->path, file, current_cred());
if (!error) {
*opened |= FILE_OPENED;
} else {
if (error == -EOPENSTALE)
goto stale_open;
goto out;
}
opened:
error = open_check_o_direct(file);
if (error)
goto exit_fput;
error = ima_file_check(file, op->acc_mode, *opened);
if (error)
goto exit_fput;

if (will_truncate) {
error = handle_truncate(file);
if (error)
goto exit_fput;
}
out:
if (got_write)
mnt_drop_write(nd->path.mnt);
path_put(&save_parent);
terminate_walk(nd);
return error;

exit_dput:
path_put_conditional(path, nd);
goto out;
exit_fput:
fput(file);
goto out;

stale_open:
/* If no saved parent or already retried then can’t retry */
if (!save_parent.dentry || retried)
goto out;

BUG_ON(save_parent.dentry != dir);
path_put(&nd->path);
nd->path = save_parent;
nd->inode = dir->d_inode;
save_parent.mnt = NULL;
save_parent.dentry = NULL;
if (got_write) {
mnt_drop_write(nd->path.mnt);
got_write = false;
}
retried = true;
goto retry_lookup;
}
[/code]

9. 有了 filp & fd 之後, do_sys_open() 接著使用 fsnotify_open() 對此 file 發出 FS_OPEN 事件通知 (供 fsnotify 的監控者使用).

include/linux/fsnotify.h

[code language=”cpp”]
/*
 * Report that a file has been opened: builds an FS_OPEN mask (with FS_ISDIR
 * added when the inode is a directory) and passes it, together with the
 * file's path, to fsnotify_parent() and fsnotify().
 */
static inline void fsnotify_open(struct file *file)
{
struct path *path = &file->f_path;
struct inode *inode = file_inode(file);
__u32 mask = FS_OPEN;

if (S_ISDIR(inode->i_mode))
mask |= FS_ISDIR;

fsnotify_parent(path, NULL, mask);
fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
}
[/code]

10. 最後則是在 do_sys_open() 使用 fd_install() 建立 fd 與 file pointer 的 mapping,
並 return fd 回去.

fs/file.c

[code language=”cpp”]
/*
 * Store file in slot fd of the fd table belonging to files. The update is
 * made under files->file_lock, and the slot must still be empty (BUG_ON
 * otherwise).
 */
void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
BUG_ON(fdt->fd[fd] != NULL);
/* publish the pointer with rcu_assign_pointer() */
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
}

/* Install file at descriptor fd in the current task's files table. */
void fd_install(unsigned int fd, struct file *file)
{
__fd_install(current->files, fd, file);
}
[/code]

心得:

經由這一系列的 trace, 發現原來 open 花的時間多的原因是: 其他 system call (read, write, etc.) 測試時使用現成的 fd, 而沒有像 open 需要重新 allocate 一個 fd.

光執行search dentry/ inode/ fd table/ fd mapping 就需花不少時間.

發表迴響