<?xml version="1.0" encoding="utf-8" standalone="yes"?>
def list_get(file):
    """Return the ``href`` value of every ``<a class="link">`` element.

    Args:
        file: Path of the HTML document to parse.

    Returns:
        A list with one entry per matching anchor, in document order. Each
        entry is whatever ``Tag.get('href')`` yields for that anchor.
    """
    # The context manager closes the handle as soon as parsing is finished;
    # BeautifulSoup reads the whole stream inside its constructor.
    with open(file) as html_file:
        soup = BeautifulSoup(html_file)
    # A comprehension avoids shadowing the builtin ``list``.
    return [anchor.get('href') for anchor in soup.find_all('a', class_='link')]
# Script entry point: run the extraction against the sample page when this
# file is executed directly (nothing happens on import).
if __name__ == "__main__":
    list_get('List.htm')
list_get函数返回的是list字符串对象<br />其c语言调用的代码如下:
#include <stdlib.h>
#include <Python.h>
char* GDALPythonObjectToCStr(PyObject* pyObject);
int main(int argc, char *argv[])
{
Py_Initialize();
if(!Py_IsInitialized())
{
return -1;
}
PyRun_SimpleString("import sys");
PyRun_SimpleString("sys.path.append('./script')");
PyObject* pModule;
PyObject* pDict;
PyObject* pFunc;
pModule = PyImport_ImportModule("list");
if(!pModule)
{
printf("can't find list.py");
system("PAUSE");
getchar();
return -1;
}
pDict = PyModule_GetDict(pModule);
if(!pDict)
{
return -1;
}
pFunc = PyDict_GetItemString(pDict,"list_get");
if(!pFunc || !PyCallable_Check(pFunc))
{
printf("can't find function [list_get]");
getchar();
return -1;
}
PyObject* args = PyTuple_New(1);
PyTuple_SetItem(args,0,Py_BuildValue("s","List.htm"));
PyObject* value = PyObject_CallObject(pFunc,args);
int ret = PySequence_Check(value);
printf("check:%d\n",ret);
int length = PySequence_Size(value);
printf("length:%d\n",length);
int i = 0;
for(;i<length;i++)
{
PyObject* obj = PySequence_GetItem(value,i);
//char* str = PyBytes_AS_STRING(obj);
char* str = GDALPythonObjectToCStr(obj);
printf("link:%s\n",str);
free(str);
}
Py_DECREF(args);
Py_DECREF(pModule);
Py_Finalize();
system("PAUSE");
return 0;
}
/* Copy nLen bytes of pszSrc into a freshly malloc'ed, NUL-terminated buffer. */
/* Returns NULL only when the allocation itself fails. */
static char* GDALPythonDupCStr(const char* pszSrc, size_t nLen)
{
    char* pszNew = (char*)malloc(nLen + 1);
    if(pszNew != NULL)
    {
        if(nLen > 0)
        {
            memcpy(pszNew, pszSrc, nLen);
        }
        pszNew[nLen] = '\0';
    }
    return pszNew;
}

/* Return a NUL terminated C string built from a PyObject. */
/* The result is always a private heap copy that the caller must release */
/* with free(), on Python 2 as well as on Python 3. */
/* Objects that are not text (or whose conversion fails) yield an empty */
/* string; NULL is returned only when memory cannot be allocated. */
char* GDALPythonObjectToCStr(PyObject* pyObject)
{
    char *pszStr = NULL;
    Py_ssize_t nLen = 0;

    if(pyObject == NULL)
    {
        return GDALPythonDupCStr("", 0);
    }
#if PY_VERSION_HEX >= 0x03000000
    if(PyUnicode_Check(pyObject))
    {
        char *pszNewStr;
        PyObject* pyUTF8Str = PyUnicode_AsUTF8String(pyObject);
        if(pyUTF8Str != NULL &&
           PyBytes_AsStringAndSize(pyUTF8Str,&pszStr,&nLen) == 0)
        {
            pszNewStr = GDALPythonDupCStr(pszStr,(size_t)nLen);
        }
        else
        {
            /* Encoding failed: drop the pending exception, return "". */
            PyErr_Clear();
            pszNewStr = GDALPythonDupCStr("", 0);
        }
        Py_XDECREF(pyUTF8Str);
        return pszNewStr;
    }
    else if(PyBytes_Check(pyObject))
    {
        if(PyBytes_AsStringAndSize(pyObject,&pszStr,&nLen) == 0)
        {
            return GDALPythonDupCStr(pszStr,(size_t)nLen);
        }
        PyErr_Clear();
        return GDALPythonDupCStr("", 0);
    }
    else
    {
        return GDALPythonDupCStr("", 0);
    }
#else
    /* Hand out a copy instead of the object's internal buffer, so that */
    /* the caller's free() is valid here too. */
    if(PyString_AsStringAndSize(pyObject,&pszStr,&nLen) == 0)
    {
        return GDALPythonDupCStr(pszStr,(size_t)nLen);
    }
    PyErr_Clear();
    return GDALPythonDupCStr("", 0);
#endif
}
]]>
使用Beautiful Soup的一个例子如下:
# Print the href of every <a class="link"> element found in List.htm.
# The context manager closes the file once BeautifulSoup has consumed it.
with open('List.htm') as html_file:
    soup = BeautifulSoup(html_file)
for a in soup.find_all('a', class_='link'):
    print(a.get('href'))
如果是使用c++ libtidy的话
对应的代码如下:
{
return no;
}
void extractContent(TidyNode node,TidyDoc doc);
/*
 * Visit every direct child of `node`: anchor elements are handed to
 * extractContent(), any other child is searched recursively.
 * `doc` is only passed through to the callees.
 */
void parseContent(TidyNode node,TidyDoc doc)
{
    TidyNode cur = tidyGetChild(node);
    while(cur)
    {
        if(!tidyNodeIsA(cur))
        {
            parseContent(cur,doc);
        }
        else
        {
            extractContent(cur,doc);
        }
        cur = tidyGetNext(cur);
    }
}
/*
 * Inspect one node.  When it is an <a> element whose class attribute is
 * exactly "link" and which carries an href attribute, print the href as
 * "link:<value>" and stop; in every other case keep searching the node's
 * children through parseContent().
 */
void extractContent(TidyNode node,TidyDoc doc)
{
    if(yes == tidyNodeIsA(node))
    {
        TidyAttr cls = tidyAttrGetCLASS(node);
        /* The value may be NULL (e.g. an attribute written without a */
        /* value), so it is checked before being handed to strcmp(). */
        const char* value = (cls != NULL) ? (const char*)tidyAttrValue(cls) : NULL;
        if(value != NULL && !strcmp(value,"link"))
        {
            TidyAttr href = tidyAttrGetHREF(node);
            if(href != NULL)
            {
                const char* link = (const char*)tidyAttrValue(href);
                if(link != NULL)
                {
                    printf("link:%s\n",link);
                }
                return;
            }
        }
    }
    parseContent(node,doc);
}
/*
 * Parse the HTML file `file` with libtidy and print the href of every
 * <a class="link"> element found below <body>.
 * Nothing is printed when the document cannot be created or parsed, or when
 * it has no body.
 */
void tidyParseHtml(char* file)
{
    TidyNode body;
    TidyDoc doc = tidyCreate();
    if(!doc)
    {
        return;
    }
    tidySetReportFilter(doc,tidyFilterCb);
    /* A negative status from tidyParseFile() is treated as "not parsed". */
    if(tidyParseFile(doc,file) >= 0)
    {
        body = tidyGetBody(doc);
        if(body)
        {
            /* Start at <body> itself so that its direct children are */
            /* examined as well; starting one level lower would skip an */
            /* anchor that sits directly under <body>. */
            parseContent(body,doc);
        }
    }
    tidyRelease(doc);
}
当然下面的python代码也能完成任务:
# Same task with a CSS attribute selector: print the href of every anchor
# whose class attribute is exactly "link".
# The context manager closes the file once BeautifulSoup has consumed it.
with open('List.htm') as html_file:
    soup = BeautifulSoup(html_file)
links = soup.select('a[class="link"]')  # named `links` so the builtin `list` stays usable
for a in links:
    if a.has_attr('href'):
        print(a.get('href'))
]]>